In [1]:
from sklearn.datasets import fetch_california_housing
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

import pandas as pd
import numpy as np

# import matplotlib.pyplot as plt

# # install seaborn with command "pip install seaborn"
# # https://seaborn.pydata.org/
# import seaborn as sns


# # install with command 'pip install statsmodels'
# import statsmodels.api as sm
# from statsmodels.stats.outliers_influence import variance_inflation_factor

# %matplotlib inline

In [2]:
# Load the California housing data and wrap the feature matrix in a DataFrame
# whose columns carry the dataset's own feature names.
california_dataset = fetch_california_housing()
data = pd.DataFrame(
    california_dataset.data,
    columns=california_dataset.feature_names,
)

# Model inputs: every column except 'Population'.
features = data.drop(columns=['Population'])

# Regression target: natural log of the dataset target, held in a
# single-column frame named 'PRICE'.
log_prices = np.log(california_dataset.target)
target = pd.DataFrame({'PRICE': log_prices})

In [3]:
# Column positions inside `property_stats`, following the column order of
# `features` (the dataset columns with 'Population' dropped).
MedInc_IDX = 0
HouseAge_IDX = 1
AveRooms_IDX = 2
AveBedrms_IDX = 3
AveOccup_IDX = 4
Latitude_IDX = 5
Longtitude_IDX = 6  # original (misspelled) name, kept so existing references still resolve
Longitude_IDX = Longtitude_IDX  # correctly spelled alias for the same column
PRICE_IDX = 7  # NOTE(review): index 7 is not a column of `property_stats`; unused in the visible cells

# Template property: a single row holding the mean of every feature column.
# reshape(1, -1) derives the column count from `features` instead of
# hard-coding 7, so the template keeps working if the feature set changes.
property_stats = features.mean().values.reshape(1, -1)

property_stats

array([[   3.870671  ,   28.63948643,    5.42899974,    1.09667515,
           3.07065516,   35.63186143, -119.56970446]])

In [4]:
# Fit an ordinary least-squares model of log price on the selected features.
regr = LinearRegression()
regr.fit(features, target)

# In-sample predictions, used only to measure how far the fit is off.
fitted_vals = regr.predict(features)

# Root mean squared error of the in-sample fit, in log-price units.
MSE = mean_squared_error(target, fitted_vals)
RMSE = np.sqrt(MSE)

In [5]:
def get_log_estimate(income, houseAge, roomNumber, high_confidence=True):
    """Predict the log price of a property together with an RMSE-based range.

    Only three features are supplied by the caller; every other feature is
    taken from the module-level template row `property_stats`.

    Parameters
    ----------
    income : value written into the MedInc column of the feature row.
    houseAge : value written into the HouseAge column of the feature row.
    roomNumber : value written into the AveRooms column of the feature row.
    high_confidence : if True the range is the estimate +/- 2*RMSE and is
        labelled 95; otherwise it is +/- 1*RMSE and is labelled 68.

    Returns
    -------
    tuple
        (log_estimate, upper_bound, lower_bound, interval) -- note that the
        upper bound comes before the lower bound.
    """
    # Work on a copy: writing into the shared template would silently change
    # `property_stats` for every other cell that reads it after this call.
    stats = property_stats.copy()
    stats[0][MedInc_IDX] = income
    stats[0][HouseAge_IDX] = houseAge
    stats[0][AveRooms_IDX] = roomNumber

    # predict() returns a 2-D result (one row, one target column); unwrap it.
    log_estimate = regr.predict(stats)[0][0]

    # Width of the range in multiples of the model's RMSE.
    if high_confidence:
        spread = 2*RMSE
        interval = 95
    else:
        spread = RMSE
        interval = 68

    upper_bound = log_estimate + spread
    lower_bound = log_estimate - spread

    return log_estimate, upper_bound, lower_bound, interval

In [6]:
# Default (wider, 95-labelled) range for a sample property.
get_log_estimate(income=2, houseAge=25, roomNumber=3)



(0.2966206060853658, 1.0036565171061618, -0.4104153049354303, 95)

In [7]:
# Same sample property with the narrower, 68-labelled range.
get_log_estimate(income=2, houseAge=25, roomNumber=3, high_confidence=False)



(0.2966206060853658, 0.6501385615957638, -0.05689734942503227, 68)

In [8]:
# Median of the raw (un-logged) dataset target; the next cell divides
# MEDIAN_PRICE by this value to obtain SCALE_FACTOR.
np.median(california_dataset.target)

1.797

In [9]:
# We assume the California median price today is $1,000,000.
# This value can be adjusted as needed.
# The model predicts log prices at the dataset's own price level (1990 census
# data according to the scikit-learn documentation), so SCALE_FACTOR is the
# ratio of today's assumed median to the dataset's median.

MEDIAN_PRICE = 10 # unit in 100,000
SCALE_FACTOR = MEDIAN_PRICE/np.median(california_dataset.target)

log_est, upper, lower, conf = get_log_estimate(2, 25, 3)

# Convert to today's dollars: undo the log, go from units of 100,000 to
# dollars, then rescale by SCALE_FACTOR.
dollar_est = np.exp(log_est) * 100000 * SCALE_FACTOR
dollar_high = np.exp(upper) * 100000 * SCALE_FACTOR
dollar_low = np.exp(lower) * 100000 * SCALE_FACTOR

# Round the dollar values to the nearest thousand
rounded_est = np.around(dollar_est, -3)
rounded_high = np.around(dollar_high, -3)
rounded_low = np.around(dollar_low, -3)

print(f'The estimated property value is {rounded_est}.')
print(f'At {conf}% confidence the valuation range is')
# Both ends of the range now carry the same 'USD' label (the upper end used to say 'US').
print(f'USD {rounded_low} at the lower end to USD {rounded_high} at the high end.')

The estimated property value is 749000.0.
At 95% confidence the valuation range is
USD 369000.0 at the lower end to US 1518000.0 at the high end.




In [10]:
def get_dollar_estimate(income,houseAge,roomNumber, high_confidence=True):
    """Print an estimated California property price and its valuation range.

    The log-price estimate from `get_log_estimate` is converted to dollars
    (exponentiate, multiply by 100000, rescale by the module-level
    SCALE_FACTOR) and rounded to the nearest thousand before printing.

    Parameters
    ----------
    income : passed through to `get_log_estimate`; must be at least 1.
    houseAge : passed through to `get_log_estimate`; must be at least 1.
    roomNumber : passed through to `get_log_estimate`; must be at least 1.
    high_confidence : passed through to `get_log_estimate` to choose between
        the wider and the narrower range.

    Returns
    -------
    None
        Results are printed, not returned. If any input is below 1 a short
        message is printed and no estimate is produced.
    """

    if roomNumber < 1 or houseAge < 1 or income < 1:
        print('That is unrealistic. Try again.')
        return

    # Forward high_confidence: it used to be dropped here, so the narrower
    # range could never be requested through this function.
    log_est, upper, lower, conf = get_log_estimate(
        income, houseAge, roomNumber, high_confidence=high_confidence
    )

    # Convert to today's dollars
    dollar_est = np.exp(log_est) * 100000 * SCALE_FACTOR
    dollar_high = np.exp(upper) * 100000 * SCALE_FACTOR
    dollar_low = np.exp(lower) * 100000 * SCALE_FACTOR

    # Round the dollar values to the nearest thousand
    rounded_est = np.around(dollar_est, -3)
    rounded_high = np.around(dollar_high, -3)
    rounded_low = np.around(dollar_low, -3)

    print(f'The estimated property value is ${rounded_est}.')
    print(f'At {conf}% confidence the valuation range is')
    print(f'USD {rounded_low} at the lower end to USD {rounded_high} at the high end.')
        

In [11]:
# Example call: keyword arguments make the three property inputs explicit.
get_dollar_estimate(income=2,houseAge=30,roomNumber=2, high_confidence=True)

The estimated property value is $782000.0.
At 95% confidence the valuation range is
USD 386000.0 at the lower end to US 1587000.0 at the high end.


