In [1]:
# Import libraries
import pandas as pd
import numpy as np
import geopandas as gpd
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

In [2]:
# Import data
# Keep the lab folder in one place so it only has to be edited once per machine.
DATA_DIR = 'C:/Users/clynn/Documents/GitHub/geospatial-data-science/labs/lab5'

# Tabular house records (one row per house)
df = pd.read_csv(f'{DATA_DIR}/seattle_house_prices.csv')

# Vector layers read from shapefiles
coast = gpd.read_file(f'{DATA_DIR}/washington_coastline.shp')
waterbodies = gpd.read_file(f'{DATA_DIR}/Waterbodies_with_History_and_Jurisdictional_detail___wtrbdy_det_area.shp')

# Preview the first rows of the house table
df.head()

Unnamed: 0,price,bedrooms,bathrooms,sqft_living,sqft_lot,yr_built,lat,long
0,538000,3,2.25,2570,7242,1951,47.721,-122.319
1,180000,2,1.0,770,10000,1933,47.7379,-122.233
2,604000,4,3.0,1960,5000,1965,47.5208,-122.393
3,510000,3,2.0,1680,8080,1987,47.6168,-122.045
4,1230000,4,4.5,5420,101930,2001,47.6561,-122.005


In [3]:
# Turn the house table into a GeoDataFrame: one point per row built from the
# long/lat columns, tagged as geographic coordinates (EPSG:4326)
gdf = gpd.GeoDataFrame(
    df,
    geometry=gpd.points_from_xy(df['long'], df['lat']),
    crs=4326,
)

# Reproject every layer to UTM 10N (EPSG:32610)
gdf_utm, coast_utm, waterbodies_utm = (
    layer.to_crs('EPSG:32610') for layer in (gdf, coast, waterbodies)
)

In [7]:
# Check summary statistics (count, mean, std, min, quartiles, max) of each numeric column
df.describe()

Unnamed: 0,price,bedrooms,bathrooms,sqft_living,sqft_lot,yr_built,lat,long
count,19451.0,19451.0,19451.0,19451.0,19451.0,19451.0,19451.0,19451.0
mean,540463.4,3.369955,2.114943,2081.149967,15084.06,1971.021438,47.55978,-122.213896
std,368512.3,0.906639,0.769818,918.160494,41601.57,29.363143,0.138783,0.140694
min,75000.0,0.0,0.0,290.0,520.0,1900.0,47.1559,-122.519
25%,321000.0,3.0,1.75,1420.0,5042.5,1952.0,47.4695,-122.328
50%,450000.0,3.0,2.25,1920.0,7620.0,1975.0,47.5717,-122.231
75%,645000.0,4.0,2.5,2550.0,10665.5,1997.0,47.6779,-122.125
max,7700000.0,11.0,8.0,13540.0,1651359.0,2015.0,47.7776,-121.315


In [9]:
# Count the non-null entries in every column
df.count()

price          19451
bedrooms       19451
bathrooms      19451
sqft_living    19451
sqft_lot       19451
yr_built       19451
lat            19451
long           19451
geometry       19451
dtype: int64

1) There are 19,451 houses in the Seattle dataset.

In [10]:
# Examine the first rows of the dataset (each row is one house record)
df.head()

Unnamed: 0,price,bedrooms,bathrooms,sqft_living,sqft_lot,yr_built,lat,long,geometry
0,538000,3,2.25,2570,7242,1951,47.721,-122.319,POINT (-122.31900 47.72100)
1,180000,2,1.0,770,10000,1933,47.7379,-122.233,POINT (-122.23300 47.73790)
2,604000,4,3.0,1960,5000,1965,47.5208,-122.393,POINT (-122.39300 47.52080)
3,510000,3,2.0,1680,8080,1987,47.6168,-122.045,POINT (-122.04500 47.61680)
4,1230000,4,4.5,5420,101930,2001,47.6561,-122.005,POINT (-122.00500 47.65610)


2) There are 7 features that could be used to predict house prices (bedrooms, bathrooms, sqft_living, sqft_lot, yr_built, lat, and long). Price itself is excluded because it is the target we are trying to predict.

In [13]:
# Check for NaN values and data types
# (df.info() lists the non-null count and dtype of every column)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19451 entries, 0 to 19450
Data columns (total 9 columns):
 #   Column       Non-Null Count  Dtype   
---  ------       --------------  -----   
 0   price        19451 non-null  int64   
 1   bedrooms     19451 non-null  int64   
 2   bathrooms    19451 non-null  float64 
 3   sqft_living  19451 non-null  int64   
 4   sqft_lot     19451 non-null  int64   
 5   yr_built     19451 non-null  int64   
 6   lat          19451 non-null  float64 
 7   long         19451 non-null  float64 
 8   geometry     19451 non-null  geometry
dtypes: float64(3), geometry(1), int64(5)
memory usage: 1.3 MB


In [14]:
# Number of missing values in each column
df.isna().sum()

price          0
bedrooms       0
bathrooms      0
sqft_living    0
sqft_lot       0
yr_built       0
lat            0
long           0
geometry       0
dtype: int64

3) There are 0 null values.

In [71]:
# Compute correlation matrix
# Restrict to numeric columns first: by this point df carries a non-numeric
# 'geometry' column, and recent pandas versions raise on non-numeric data in
# DataFrame.corr() instead of silently dropping it.
corr_matrix = df.select_dtypes(include='number').corr()

# Display just the house price correlations, strongest first
corr_matrix["price"].sort_values(ascending=False)

price          1.000000
sqft_living    0.702296
bathrooms      0.524395
bedrooms       0.315804
lat            0.308082
sqft_lot       0.090125
yr_built       0.052453
long           0.020092
Name: price, dtype: float64

4) The three features best correlated with house price are living area in square feet (sqft_living, 0.702), bathrooms (0.524), and bedrooms (0.316).

5) The three features least correlated with house price are long (0.020), year built (0.052), and lot size in square feet (sqft_lot, 0.090). The lat feature falls between the best- and least-correlated groups, at 0.308.

Part 2

In [1]:
# Import libraries
import pandas as pd
import numpy as np
import geopandas as gpd
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

In [3]:
# Import data
# Keep the lab folder in one place so it only has to be edited once per machine.
DATA_DIR = 'C:/Users/clynn/Documents/GitHub/geospatial-data-science/labs/lab5'

# Tabular house records (one row per house)
df = pd.read_csv(f'{DATA_DIR}/seattle_house_prices.csv')

# Coastline layer read from a shapefile
coast = gpd.read_file(f'{DATA_DIR}/washington_coastline.shp')

# Preview the first rows of the house table
df.head()

Unnamed: 0,price,bedrooms,bathrooms,sqft_living,sqft_lot,yr_built,lat,long
0,538000,3,2.25,2570,7242,1951,47.721,-122.319
1,180000,2,1.0,770,10000,1933,47.7379,-122.233
2,604000,4,3.0,1960,5000,1965,47.5208,-122.393
3,510000,3,2.0,1680,8080,1987,47.6168,-122.045
4,1230000,4,4.5,5420,101930,2001,47.6561,-122.005


In [4]:
# Turn the house table into a GeoDataFrame: one point per row built from the
# long/lat columns, tagged as geographic coordinates (EPSG:4326)
gdf = gpd.GeoDataFrame(
    df,
    geometry=gpd.points_from_xy(df['long'], df['lat']),
    crs=4326,
)

# Reproject both layers to UTM 10N (EPSG:32610)
gdf_utm, coast_utm = (layer.to_crs('EPSG:32610') for layer in (gdf, coast))

In [5]:
# Columns used as model inputs. Only the raw CSV columns are listed here;
# the engineered columns created in later cells are not part of this list.
feature_list = [
    'sqft_living',
    'bathrooms',
    'bedrooms',
    'lat',
    'sqft_lot',
    'yr_built',
    'long',
]

# Feature matrix (X) and target (y = price)
X = gdf_utm[feature_list]
y = gdf_utm['price']

# Standardize the features: fit the scaler on X, then transform X with it
scaler = StandardScaler()
X_scaled = scaler.fit(X).transform(X)

In [28]:
# Compute correlation matrix on the numeric columns only: gdf_utm also holds a
# 'geometry' column, and recent pandas versions raise on non-numeric data in
# DataFrame.corr() instead of silently dropping it.
# NOTE(review): the saved output lists engineered columns (e.g. distance_to_coast),
# so this cell was evidently executed after the feature-engineering cells.
corr_matrix = gdf_utm.select_dtypes(include='number').corr()

# Display just the house price correlations, strongest first
corr_matrix["price"].sort_values(ascending=False)

price                 1.000000
sqftl_per_yr_built    0.708089
sqft_living           0.702296
sqftl_per_lat         0.700293
sqft_liv_per_bed      0.580062
bathrooms             0.524395
bedrooms              0.315804
long_lat              0.309133
lat                   0.308082
sqft_lot              0.090125
sqft_lot_per_lat      0.089649
yr_built              0.052453
sqft_lot_per_room     0.048319
distance_to_coast     0.027830
long                  0.020092
Name: price, dtype: float64

In [7]:
# Ratio of living area to construction year
gdf_utm['sqftl_per_yr_built'] = gdf_utm['sqft_living'].div(gdf_utm['yr_built'])

In [8]:
# Living area per bedroom. Some houses have 0 bedrooms, which would turn the
# ratio into inf; mask those divisors so the result is NaN (missing) instead.
gdf_utm['sqft_liv_per_bed'] = gdf_utm['sqft_living'] / gdf_utm['bedrooms'].replace(0, np.nan)

In [9]:
# Lot area per bedroom (the column is named "per_room" but the divisor is bedrooms).
# Some houses have 0 bedrooms, which would turn the ratio into inf; mask those
# divisors so the result is NaN (missing) instead.
gdf_utm['sqft_lot_per_room'] = gdf_utm['sqft_lot'] / gdf_utm['bedrooms'].replace(0, np.nan)

In [10]:
# Ratio of longitude to latitude
gdf_utm['long_lat'] = gdf_utm['long'].div(gdf_utm['lat'])

In [11]:
# Ratio of living area to latitude
gdf_utm['sqftl_per_lat'] = gdf_utm['sqft_living'].div(gdf_utm['lat'])

In [13]:
# Compute distance to coast
# Merge all coastline features into a single geometry, then measure every house
# against it in one vectorised call instead of looping over rows in Python.
# The distance to the union equals the minimum distance to any single feature.
coast_geoms = coast_utm.geometry
# union_all() is the newer GeoPandas spelling; fall back to unary_union when absent
coast_union = coast_geoms.union_all() if hasattr(coast_geoms, 'union_all') else coast_geoms.unary_union
distance_to_coast = gdf_utm['geometry'].distance(coast_union).tolist()

# Add to DataFrame
gdf_utm['distance_to_coast'] = distance_to_coast

In [48]:
# Split data
# Split the unscaled features first, then fit the scaler on the training rows
# only, so test-set statistics never leak into the preprocessing step.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
split_scaler = StandardScaler().fit(X_train_raw)
X_train = split_scaler.transform(X_train_raw)
X_test = split_scaler.transform(X_test_raw)

In [49]:
# Define model
# Fix the seed so the fitted forest (and the RMSE computed from it) is reproducible
forest_reg = RandomForestRegressor(n_estimators=30, random_state=42)

# Fit model on the training split
forest_reg.fit(X_train, y_train)

RandomForestRegressor(n_estimators=30)

In [50]:
# Predict prices for the held-out test rows
predictions = forest_reg.predict(X_test)

# Root-mean-squared error between the true and predicted prices.
# It is expressed in the same units as price; lower is better.
final_mse = mean_squared_error(y_true=y_test, y_pred=predictions)
final_rmse = np.sqrt(final_mse)
final_rmse

153666.61591451074