# Mini Project 2 - Housing Prediction

Objective
* Determine what key variables predict housing prices
* Predict the house price 1 year, 2 years, and 5 years from today
* Is it a good investment to buy a house today?
* Is it a good investment to sell a house today?

Steps
1. Load and explore data
2. Build linear regression model
3. Predict house sale price
4. Iterate and improve model

## 1. Load and Explore Data

In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
%matplotlib inline

# Report the library versions this notebook was run with
for label, module in (('Numpy: ', np), ('Pandas: ', pd), ('Seaborn: ', sns)):
    print(label, module.__version__)

Numpy:  1.17.2
Pandas:  0.25.1
Seaborn:  0.9.0


In [3]:
# Load the Ames housing data (path is relative to the notebook's working directory)
ames_csv = '../../Data/ames-housing.csv'
house = pd.read_csv(ames_csv)
house.head()

FileNotFoundError: [Errno 2] File b'../../Data/ames-housing.csv' does not exist: b'../../Data/ames-housing.csv'

In [None]:
# Check shape: (number of rows, number of columns)
house.shape

In [None]:
# Check column names to see which features are available
house.columns

In [None]:
# Check the data type of every column.
# Raise pandas' row display limit to 81 so the dtypes listing is shown in full
# (presumably 81 is the number of columns -- confirm against house.shape).
# NOTE: set_option is global, so this limit also applies to every later cell.
pd.set_option('display.max_rows', 81)
house.dtypes

In [None]:
# Count missing values per column and show only the columns that have any
null_counts = house.isnull().sum()
null_counts[null_counts > 0]

In [None]:
# Pairwise correlation between the numeric columns.
# Numeric columns are selected explicitly because pandas >= 2.0 no longer
# drops non-numeric columns silently and raises on text columns instead.
house.select_dtypes(include='number').corr()

In [None]:
# Summary statistics for the OverallQual column
house['OverallQual'].describe()

In [None]:
# Horizontal bar chart of the number of rows per neighborhood, sorted by count
neighborhood_counts = house['Neighborhood'].value_counts().sort_values()
neighborhood_counts.plot(kind='barh', figsize=(15,10));

In [None]:
# Scatter plot of sale price against the year of sale
import matplotlib.pyplot as plt
import math

year_sold = house['YrSold']
plt.scatter(year_sold, house['SalePrice'])

# Put a tick on every whole year so the axis shows only integers, no decimals
first_year = min(year_sold)
last_year = math.ceil(max(year_sold))
plt.xticks(range(first_year, last_year + 1))
plt.xlabel('Year Sold')
plt.ylabel('Sale Price')
plt.show()

In [None]:
# Scatter plot of sale price against lot size
# (LotArea is defined as the lot size in square feet)
lot_area, sale_price = house['LotArea'], house['SalePrice']
plt.scatter(lot_area, sale_price)
plt.xlabel('Lot Area')
plt.ylabel('Sale Price')
plt.show()

We would like to identify and remove the outliers, likely where Lot Area > 100,000. But let's reach a first prediction, then go back and refine.

In [None]:
# Visualise the distribution of sale prices as a 20-bin histogram
house['SalePrice'].hist(bins=20)
plt.show()

In [None]:
# Visualise the distribution of the year each property was built (20-bin histogram)
house['YearBuilt'].hist(bins=20)
plt.show()

In [None]:
# Price trend over the period covered by the data: mean sale price per year sold
YearMeanPrice = house['SalePrice'].groupby(house['YrSold']).mean()
print(YearMeanPrice)

## 2. Build the Model

In [None]:
from sklearn.linear_model import LinearRegression

In [None]:
# First iteration: use LotArea as the only predictor of SalePrice

# Build (n_samples, 1) arrays for the feature and the target
X = house[['LotArea']].values
y = house[['SalePrice']].values

# Fit the linear regression and report its coefficient and intercept
linreg = LinearRegression().fit(X, y)
print('Coefficient:', linreg.coef_, 'Intercept:', linreg.intercept_)

# R^2, measured on the same data the model was fitted on
print('R^2:', linreg.score(X, y))

In [None]:
# Split the data into training (80%) and testing (20%) subsets
from sklearn.model_selection import train_test_split

# A fixed random_state makes the shuffle, and every score derived from this
# split, reproducible across kernel restarts
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
# Inspect the training split: container type, row count and the first 10 values

# Feature side
print('X = Lot Area')
print('X_train is a', type(X_train))
print('X_train rows:', len(X_train))
print('Sample of X_train:')
print(X_train[:10])

print()

# Target side
print('y = Sale Price')
print('y_train is a', type(y_train))
print('y_train rows:', len(y_train))
print('Sample of y_train:')
print(y_train[:10])

In [None]:
# Refit the model on the training subset only
linreg.fit(X_train, y_train)
print('Coefficient: ', linreg.coef_, 'Intercept:', linreg.intercept_)

# Score the model on the held-out test subset
test_r2 = linreg.score(X_test, y_test)
print('R^2:', test_r2)

In [None]:
# Calculate the Root Mean Squared Error (RMSE) on the test subset
from sklearn.metrics import mean_squared_error
from math import sqrt

preds = linreg.predict(X_test)
# The square root of the MSE is the RMSE (same units as SalePrice),
# so name and label the value accordingly
rmse = sqrt(mean_squared_error(y_test, preds))
print('Root Mean Squared Error:', rmse)

## 3. Predict House Sale Price

In [None]:
# Check that there is a fitted model: show its intercept and coefficient(s)
linreg.intercept_, linreg.coef_

In [None]:
# Predict price if Lot Area = 10000
# The input is a nested list: one sample (row) with one feature (LotArea)
x = [[10000]]
Ypred = linreg.predict(x)
print('Predicted house sale price:', Ypred)

In [None]:
# Predict price if Lot Area = 50000
# The input is a nested list: one sample (row) with one feature (LotArea)
x = [[50000]]
Ypred = linreg.predict(x)
print('Predicted house sale price:', Ypred)

In [None]:
# Predict price if Lot Area = 150000
# The input is a nested list: one sample (row) with one feature (LotArea)
x = [[150000]]
Ypred = linreg.predict(x)
print('Predicted house sale price:', Ypred)

In [None]:
# Plot LotArea against SalePrice with a least-squares straight line on top
lot_area = house['LotArea']
sale_price = house['SalePrice']
plt.plot(lot_area, sale_price, 'o')

# A degree-1 polyfit returns the slope first, then the intercept
slope, intercept = np.polyfit(lot_area, sale_price, 1)
plt.plot(lot_area, slope*lot_area + intercept)
plt.show()

## 4. Iterate and Improve Model
Let's experiment with adding multiple features to see if it improves the R^2 score

In [None]:
# Baseline to beat: SalePrice predicted from LotArea alone, fitted on all rows
X = house[['LotArea']].values
y = house[['SalePrice']].values
linreg = LinearRegression().fit(X, y)
print('Coefficient:', linreg.coef_, 'Intercept:', linreg.intercept_)
# R^2 on the same data used for fitting
baseline_r2 = linreg.score(X, y)
print('R^2:', baseline_r2)

In [None]:
# Add a second feature: predict SalePrice from LotArea and YrSold
feature_names = ['LotArea', 'YrSold']
X = house[feature_names]
y = house['SalePrice']

# NOTE: no train/test split is performed in this cell; the model is fitted
# and scored on the full dataset

# Fit the linear regression and report its coefficients and intercept
from sklearn.linear_model import LinearRegression
linreg = LinearRegression().fit(X, y)
print('Coefficient:', linreg.coef_, 'Intercept:', linreg.intercept_)

# R^2 on the same data used for fitting
print('R^2:', linreg.score(X, y))

In [None]:
# Use a heatmap to explore which other features to include.
# Numeric columns are selected explicitly because pandas >= 2.0 no longer
# drops non-numeric columns silently in corr() and raises on text columns.
plt.figure(figsize=(32,20))
cor = house.select_dtypes(include='number').corr()
sns.heatmap(cor, annot=True, cmap=plt.cm.Reds)
plt.show()

In [None]:
# Absolute correlation of every column with the target variable, SalePrice
cor_target = cor['SalePrice'].abs()

# Keep the highly correlated features (above 0.5), strongest first
is_relevant = cor_target > 0.5
relevant_features = cor_target[is_relevant].sort_values(ascending=False)
relevant_features

In [None]:
# Summary statistics for the OverallQual column
house['OverallQual'].describe()

In [None]:
# Of these top variables, look at how they correlate with SalePrice and with each other
cols = [
    'SalePrice', 'OverallQual', 'GrLivArea', 'GarageCars', 'GarageArea',
    'TotalBsmtSF', '1stFlrSF', 'FullBath', 'TotRmsAbvGrd', 'YearBuilt',
    'YearRemodAdd',
]
house.loc[:, cols].corr()

In [None]:
# Grid of pairwise plots for the columns listed in cols
sns.pairplot(house[cols])
plt.show()

In [None]:
# Summary statistics for OverallQual, one of the three features used in the next model
house['OverallQual'].describe()

In [None]:
# Summary statistics for GrLivArea, one of the three features used in the next model
house['GrLivArea'].describe()

In [None]:
# Summary statistics for GarageArea, one of the three features used in the next model
house['GarageArea'].describe()

In [None]:
# Select the three predictor columns (already a 2D table) and the target
X2 = house[['OverallQual', 'GrLivArea', 'GarageArea']]
y = house['SalePrice']


# TODO: turn this split / fit / score sequence into a reusable function
# Create training and testing subsets; a fixed random_state keeps the split,
# and therefore the reported scores, reproducible across kernel restarts
X2_train, X2_test, y_train, y_test = train_test_split(X2, y, test_size=0.2, random_state=42)

# Find coefficients and intercept using linear regression on the training subset
from sklearn.linear_model import LinearRegression
linreg = LinearRegression()
linreg.fit(X2_train, y_train)
print('Coefficient:', linreg.coef_, 'Intercept:', linreg.intercept_)

# Find R^2 on the data the model was fitted on and on the held-out data
print('R^2 train:', linreg.score(X2_train, y_train))
print('R^2 test:', linreg.score(X2_test, y_test))

Predict house prices again using the new model.

In [None]:
# Check that there is a fitted model: show its intercept and coefficients
linreg.intercept_, linreg.coef_

In [None]:
# Predict price if OverallQual = 6, GrLivArea = 1515, GarageArea = 472.
# The model was fitted on a DataFrame, so pass the sample as a DataFrame with
# the same column names: each value is tied to its feature explicitly, and
# newer scikit-learn versions do not warn about missing feature names.
x = pd.DataFrame([[6, 1515, 472]], columns=['OverallQual', 'GrLivArea', 'GarageArea'])
pred = linreg.predict(x)
print("Predicted Sale Price is ${}".format(round(pred[0], 2)))

In [None]:
# Find the Root Mean Squared Error (RMSE) of the three-feature model on the test subset
from sklearn.metrics import mean_squared_error
from math import sqrt

preds2 = linreg.predict(X2_test)
# The square root of the MSE is the RMSE (same units as SalePrice),
# so name and label the value accordingly
rmse = sqrt(mean_squared_error(y_test, preds2))
print('Root Mean Squared Error:', rmse)

In [None]:
# Clean the Lot Area data by dropping the largest lots, then compare the
# fitted line before and after the cleaning.
# The cutoff applied by the filter below is 50,000 square feet.
LOT_AREA_CUTOFF = 50000  # rows with LotArea strictly below this value are kept


def plot_lot_area_fit(data, title):
    """Plot SalePrice against LotArea with a least-squares line on top.

    Parameters
    ----------
    data : DataFrame with 'LotArea' and 'SalePrice' columns.
    title : str, used as the plot title.

    Returns
    -------
    (m, b) : slope and intercept of the degree-1 np.polyfit line.
    """
    plt.title(title)
    plt.xlabel('LotArea')
    plt.ylabel('SalePrice')
    # Draw the points once; wrapping a plotting call in print() only echoes
    # the artist's repr into the cell output
    plt.plot(data['LotArea'], data['SalePrice'], 'o')
    m, b = np.polyfit(data['LotArea'], data['SalePrice'], 1)
    plt.plot(data['LotArea'], m*data['LotArea'] + b)
    plt.show()
    return m, b


m, b = plot_lot_area_fit(house, 'Original')

house2 = house[house['LotArea'] < LOT_AREA_CUTOFF]
m, b = plot_lot_area_fit(house2, 'Cleaned')

In [None]:
# Original data: SalePrice regressed on LotArea using every row of house
X = house[['LotArea']].values
y = house[['SalePrice']].values
linreg = LinearRegression().fit(X, y)
print('Coefficient:', linreg.coef_, 'Intercept:', linreg.intercept_)
original_r2 = linreg.score(X, y)
print('R^2:', original_r2)

In [None]:
# Cleaned data: the same regression fitted on the filtered frame, house2
X = house2[['LotArea']].values
y = house2[['SalePrice']].values
linreg = LinearRegression().fit(X, y)
print('Coefficient:', linreg.coef_, 'Intercept:', linreg.intercept_)
cleaned_r2 = linreg.score(X, y)
print('R^2:', cleaned_r2)