In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import scipy.stats as stats
import seaborn as sns
import statsmodels
from statsmodels.formula.api import ols

In [2]:
df = pd.read_csv('data/testing_data.csv')

In [3]:
df.drop(columns=['Unnamed: 0'], inplace=True)

In [4]:
df

Unnamed: 0,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,grade,...,Seattle,Tukwila,Shoreline,Burien,Des Moines,Normandy Park,Seatac,Lake Forest Park,has_basement,recently_renovated
0,221900.0,3,1.00,1180,5650,1.0,0,NONE,Average,7 Average,...,1,1,0,0,0,0,0,0,0,0
1,538000.0,3,2.25,2570,7242,2.0,0,NONE,Average,7 Average,...,1,0,0,0,0,0,0,0,1,0
2,180000.0,2,1.00,770,10000,1.0,0,NONE,Average,6 Low Average,...,0,0,0,0,0,0,0,0,0,0
3,604000.0,4,3.00,1960,5000,1.0,0,NONE,Very Good,7 Average,...,1,0,0,0,0,0,0,0,1,0
4,510000.0,3,2.00,1680,8080,1.0,0,NONE,Average,8 Good,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21394,360000.0,3,2.50,1530,1131,3.0,0,NONE,Average,8 Good,...,1,0,0,0,0,0,0,0,0,0
21395,400000.0,4,2.50,2310,5813,2.0,0,NONE,Average,8 Good,...,1,0,0,1,0,0,0,0,0,0
21396,402101.0,2,0.75,1020,1350,2.0,0,NONE,Average,7 Average,...,1,0,0,0,0,0,0,0,0,0
21397,400000.0,3,2.50,1600,2388,2.0,0,NONE,Average,8 Good,...,0,0,0,0,0,0,0,0,0,0


In [5]:
cities = {'Algona': [98001], 'Auburn': [98001, 98002, 98003, 98023, 98063, 98071, 98092, 98093], 'Federal Way': [98001, 98003, 98023, 98063, 98093],
         'Beaux Arts Village': [98004], 'Bellevue':[98004, 98005, 98006, 98007, 98008, 98009, 98015], 
          'Clyde Hill': [98004], 'Hunts Point': [98004], 'Yarrow Point': [98004], 'Black Diamond': [98010],
         'Bothell': [98011, 98041, 98028], 'Burton': [98013], 'Vashion': [98013], 'Carnation': [98014],
         'Duvall':[98019], 'Enumclaw': [98022], 'Fall City': [98024], 'Hobart': [98025], 
         'Issaquah': [98027], 'Kent': [98030, 98031, 98032, 98035, 98042, 98064], 'Kirkland':[98033, 98034, 98083],
         'Maple Valley': [98038], 'Medina': [98039], 'Mercer Island': [98040], 'Kenmore': [98028],
         'Covington':[98042], 'North Bend': [98045], 'Pacific':[98047], 'Preston':[98050],
         'Ravensdale':[98051], 'Redmond': [98052, 98053, 98073, 98074], 'Redondo': [98054], 'Renton': [98055, 98056, 98057, 98058, 98059],
         'Newcastle': [98056, 98059,], 'Seahurst':[98062], 'Snoqualmie': [98065, 98068], 'Snoqualmie Pass': [98068],
         'Vashon': [98070], 'Woodinville': [98072], 'Sammamish':[98075, 98075], 'Issaquah': [98075, 98027, 98029],
         'Seattle': [98101, 98102, 98103, 98104, 98105, 98106, 98107, 98108, 98109, 98111, 98112, 98114, 98115, 98116, 98117, 98118, 98119, 98121, 98122, 98124, 98125, 
                     98126, 98131, 98132, 98133, 98134, 98136, 98138, 98144, 98145, 98146, 98148, 98154, 98155, 98158, 98160, 98161, 98164, 98166, 98168, 98171, 98174, 98177, 98178, 98188, 98198, 98199 ], 
          'Tukwila': [98108, 98138, 98168, 98178, 98188],
         'Shoreline': [98133, 98155, 98177 ], 'Burien': [98146, 98148, 98166, 98168 ], 'Des Moines' : [98148, 98198], 'Normandy Park': [98148, 98166, 98198], 'Seatac': [98148, 98158, 98168, 98188, 98198], 
          'Lake Forest Park': [98155, 98155, 98155], 'Baring': [98224], 'Skykomish': [98288]}

In [6]:
data = {}

for i in cities.keys():
    if i in df.columns:
        data[i] = df[df[i] == 1]
    else:
        continue

In [7]:
data.keys()

dict_keys(['Algona', 'Auburn', 'Federal Way', 'Beaux Arts Village', 'Bellevue', 'Clyde Hill', 'Hunts Point', 'Yarrow Point', 'Black Diamond', 'Bothell', 'Carnation', 'Duvall', 'Enumclaw', 'Fall City', 'Issaquah', 'Kent', 'Kirkland', 'Maple Valley', 'Medina', 'Mercer Island', 'Kenmore', 'Covington', 'North Bend', 'Redmond', 'Renton', 'Newcastle', 'Snoqualmie', 'Vashon', 'Woodinville', 'Sammamish', 'Seattle', 'Tukwila', 'Shoreline', 'Burien', 'Des Moines', 'Normandy Park', 'Seatac', 'Lake Forest Park'])

In [8]:
df.columns
outliers = ['price', 'bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot', 'floors',
       'waterfront', 'view', 'condition', 'grade', 'sqft_above', 'sqft_basement', 'sqft_living15', 'sqft_lot15',
           'has_basement', 'recently_renovated']

In [9]:
for i in data.keys():
    print("{}".format(i))
    print(data[i].loc[:, outliers].describe())
    print('       ')

Algona
               price    bedrooms   bathrooms  sqft_living       sqft_lot  \
count     361.000000  361.000000  361.000000   361.000000     361.000000   
mean   281194.869806    3.393352    2.011773  1903.783934   14967.002770   
std     98837.430520    0.726758    0.609274   653.794856   21213.757025   
min    100000.000000    1.000000    0.750000   770.000000    2064.000000   
25%    215000.000000    3.000000    1.500000  1440.000000    7245.000000   
50%    260000.000000    3.000000    2.000000  1824.000000    9292.000000   
75%    320000.000000    4.000000    2.500000  2240.000000   14439.000000   
max    850000.000000    6.000000    3.500000  5440.000000  239580.000000   

           floors  waterfront   sqft_above  sqft_basement  sqft_living15  \
count  361.000000       361.0   361.000000     361.000000     361.000000   
mean     1.430748         0.0  1723.756233     180.027701    1830.099723   
std      0.490942         0.0   630.886203     366.819528     494.591617   
min 

              price    bedrooms   bathrooms  sqft_living       sqft_lot  \
count  4.780000e+02  478.000000  478.000000   478.000000     478.000000   
mean   4.738659e+05    3.508368    2.198222  2175.910042   11681.941423   
std    1.394898e+05    0.810656    0.587975   699.962133   13668.097927   
min    1.600000e+05    1.000000    0.750000   720.000000    1116.000000   
25%    3.900000e+05    3.000000    1.750000  1660.000000    7210.000000   
50%    4.521250e+05    3.000000    2.250000  2150.000000    9602.000000   
75%    5.387500e+05    4.000000    2.500000  2590.000000   11924.000000   
max    1.600000e+06    6.000000    4.750000  4890.000000  209959.000000   

           floors  waterfront   sqft_above  sqft_basement  sqft_living15  \
count  478.000000  478.000000   478.000000     478.000000     478.000000   
mean     1.453975    0.002092  1851.861925     313.566946    2155.257322   
std      0.493113    0.045739   658.066584     458.286718     500.567923   
min      1.000000   

              price   bedrooms  bathrooms  sqft_living      sqft_lot  \
count  5.000000e+01  50.000000  50.000000      50.0000     50.000000   
mean   2.161300e+06   4.060000   3.200000    3800.9000  17403.560000   
std    1.166904e+06   0.890081   1.366509    1764.5025   6655.224175   
min    7.875000e+05   2.000000   1.000000    1220.0000   6572.000000   
25%    1.402500e+06   4.000000   2.250000    2680.0000  13797.500000   
50%    1.895000e+06   4.000000   3.000000    3560.0000  17188.500000   
75%    2.560000e+06   4.750000   3.687500    4452.5000  20052.250000   
max    6.890000e+06   7.000000   7.750000    9890.0000  35069.000000   

          floors  waterfront   sqft_above  sqft_basement  sqft_living15  \
count  50.000000   50.000000    50.000000       50.00000      50.000000   
mean    1.560000    0.020000  3290.900000      510.00000    3132.200000   
std     0.501427    0.141421  1595.431517      714.84264     701.535604   
min     1.000000    0.000000  1080.000000        0.

              price    bedrooms   bathrooms  sqft_living       sqft_lot  \
count  3.590000e+02  359.000000  359.000000   359.000000     359.000000   
mean   7.907347e+05    3.855153    2.735376  3016.370474   19002.281337   
std    2.855497e+05    0.694085    0.658233   845.034735   27631.439141   
min    4.064300e+05    1.000000    1.000000  1160.000000    1767.000000   
25%    6.420000e+05    3.000000    2.500000  2505.000000    6815.000000   
50%    7.399990e+05    4.000000    2.500000  2990.000000   10955.000000   
75%    8.675000e+05    4.000000    3.000000  3485.000000   18299.000000   
max    3.200000e+06    7.000000    6.500000  7000.000000  323215.000000   

           floors  waterfront   sqft_above  sqft_basement  sqft_living15  \
count  359.000000  359.000000   359.000000     359.000000     359.000000   
mean     1.809192    0.022284  2831.189415     180.389972    2980.130919   
std      0.418429    0.147812   889.439512     439.292681     612.109506   
min      1.000000   

In [10]:
df.lat, df.long

(0        47.5112
 1        47.7210
 2        47.7379
 3        47.5208
 4        47.6168
           ...   
 21394    47.6993
 21395    47.5107
 21396    47.5944
 21397    47.5345
 21398    47.5941
 Name: lat, Length: 21399, dtype: float64,
 0       -122.257
 1       -122.319
 2       -122.233
 3       -122.393
 4       -122.045
           ...   
 21394   -122.346
 21395   -122.362
 21396   -122.299
 21397   -122.069
 21398   -122.299
 Name: long, Length: 21399, dtype: float64)

In [11]:
from math import cos, sin, atan2, sqrt, pi

def center_geolocation(geolocations):
    """
    Provide a relatively accurate center lat, lon returned as a list pair, given
    a list of list pairs.
    ex: in: geolocations = ((lat1,lon1), (lat2,lon2),)
        out: (center_lat, center_lon)
    """
    x = 0
    y = 0
    z = 0

    for lat, lon in geolocations:
        lat = float(lat *(pi/180))
        lon = float(lon * (pi/180))
        x += cos(lat) * cos(lon)
        y += cos(lat) * sin(lon)
        z += sin(lat)

    x = float(x / len(geolocations))
    y = float(y / len(geolocations))
    z = float(z / len(geolocations))
    
    degrees1 = atan2(z, sqrt(x * x + y * y))
    degrees2 = atan2(y, x)

    return (degrees1 * (180/pi), degrees2 *(180/pi))

In [12]:
coordinates = {}

for i in data.keys():
    x = data[i]['lat']
    y = data[i]['long']
    pair = list(zip(x,y))
    
    coordinates[i] = pair
    
    

    

In [13]:
center_location = {}

for i in data.keys():
    center_location[i] = center_geolocation(coordinates[i])

In [14]:
for i in center_location.keys():
    print('{} : {}'.format(i, center_location[i]))

Algona : (47.309106343981384, -122.27070044933522)
Auburn : (47.30854005572448, -122.27676582103699)
Federal Way : (47.31053224165728, -122.32132927351802)
Beaux Arts Village : (47.616183826606125, -122.20518721213313)
Bellevue : (47.59390218225879, -122.15570714644898)
Clyde Hill : (47.616183826606125, -122.20518721213313)
Hunts Point : (47.616183826606125, -122.20518721213313)
Yarrow Point : (47.616183826606125, -122.20518721213313)
Black Diamond : (47.333218058574275, -121.99947283065686)
Bothell : (47.75516310891854, -122.22772946271448)
Carnation : (47.671627810177164, -121.84858639209433)
Duvall : (47.73710920443352, -121.95483585247659)
Enumclaw : (47.21123075623289, -121.99554880772615)
Fall City : (47.55876680494092, -121.90547261010268)
Issaquah : (47.554457705248, -122.02523399866831)
Kent : (47.375893695872826, -122.16518645535287)
Kirkland : (47.70273832807398, -122.19865397295318)
Maple Valley : (47.37064204472687, -122.03175526548821)
Medina : (47.6258401004965, -122.233

In [15]:
center_location.pop('Clyde Hill')
center_location.pop('Hunts Point')
center_location.pop('Yarrow Point')


(47.616183826606125, -122.20518721213313)

In [22]:
import folium

base_map = folium.Map([47.75486440772105, -122.30354127965661], zoom_start=13)

for i in center_location.keys():
        
        lat = center_location[i][0]
        long = center_location[i][1]
        marker = folium.Marker(location=[lat, long])
        marker.add_to(base_map)
        popup_text = "City: {}, Latitude: {}, Longitude: {}".format(i, lat,long)
        popup = folium.Popup(popup_text, parse_html=True)
        marker = folium.Marker(location=[lat, long], popup=popup, marker_color='green')
        marker.add_to(base_map)

base_map