In [1]:
from sklearn.datasets import fetch_california_housing
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
import pandas as pd
import numpy as np

In [2]:
# Gathering data: load the California housing set and build the model frames.
cali_dataset = fetch_california_housing()
data = pd.DataFrame(cali_dataset.data, columns=cali_dataset.feature_names)

# Model inputs: every column except Population and AveOccup.
features = data.drop(columns=['Population', 'AveOccup'])

# The regression target is the natural log of the dataset's target values,
# stored as a single-column frame named PRICE.
log_prices = np.log(cali_dataset.target)
target = pd.DataFrame({'PRICE': log_prices})

In [3]:
print (features)

# Positions of each feature inside a row of `features`. They are looked up by
# column name rather than hard-coded, so they stay correct if the set or
# order of retained columns ever changes (a missing column raises KeyError
# here instead of silently pointing at the wrong feature later on).
inc_idx = features.columns.get_loc('MedInc')
age_idx = features.columns.get_loc('HouseAge')
rooms_idx = features.columns.get_loc('AveRooms')
bedrms_idx = features.columns.get_loc('AveBedrms')
lati_idx = features.columns.get_loc('Latitude')
longi_idx = features.columns.get_loc('Longitude')

       MedInc  HouseAge  AveRooms  AveBedrms  Latitude  Longitude
0      8.3252      41.0  6.984127   1.023810     37.88    -122.23
1      8.3014      21.0  6.238137   0.971880     37.86    -122.22
2      7.2574      52.0  8.288136   1.073446     37.85    -122.24
3      5.6431      52.0  5.817352   1.073059     37.85    -122.25
4      3.8462      52.0  6.281853   1.081081     37.85    -122.25
...       ...       ...       ...        ...       ...        ...
20635  1.5603      25.0  5.045455   1.133333     39.48    -121.09
20636  2.5568      18.0  6.114035   1.315789     39.49    -121.21
20637  1.7000      17.0  5.205543   1.120092     39.43    -121.22
20638  1.8672      18.0  5.329513   1.171920     39.43    -121.32
20639  2.3886      16.0  5.254717   1.162264     39.37    -121.24

[20640 rows x 6 columns]


In [4]:
# Baseline "average property": the column means of `features` laid out as a
# single row (shape 1 x n_features) so it can be passed straight to predict().
# reshape(1, -1) infers the column count instead of hard-coding it, and the
# explicit copy gives an array that owns its memory and is writable -- pandas
# may otherwise hand back a read-only view (e.g. under Copy-on-Write).
property_stats = features.mean().to_numpy().reshape(1, -1).copy()
print(property_stats)

[[   3.870671     28.63948643    5.42899974    1.09667515   35.63186143
  -119.56970446]]


In [5]:
# Fit a linear regression of the log-price target on the selected features,
# then predict on the same rows to obtain the in-sample fitted values.
regr = LinearRegression()
regr.fit(features, target)
fitted_vals = regr.predict(features)

In [6]:
# In-sample error of the fitted model, computed with sklearn: the mean squared
# error between the log-price target and the fitted values, and its square root.
MSE = mean_squared_error(y_true=target, y_pred=fitted_vals)
RMSE = np.sqrt(MSE)

In [9]:
def get_log_estimate(house_age, no_rooms, no_bdrooms, high_confidence=True):
    """Estimate the log price of a property together with a range around it.

    The property is described by the module-level ``property_stats`` row, with
    its house-age, rooms and bedrooms entries replaced by the given arguments;
    every other feature keeps its value from ``property_stats``.

    Parameters
    ----------
    house_age : value written to the ``age_idx`` position of the feature row.
    no_rooms : value written to the ``rooms_idx`` position of the feature row.
    no_bdrooms : value written to the ``bedrms_idx`` position of the feature row.
    high_confidence : if True the range is +/- 2 * RMSE and is reported as a
        95% interval; otherwise it is +/- 1 * RMSE and is reported as 68%.

    Returns
    -------
    tuple
        ``(log_estimate, upper_bound, lower_bound, interval)`` -- note that
        the upper bound comes before the lower bound.
    """
    # Work on a private copy so the shared baseline row is never modified;
    # writing into `property_stats` directly would leave the last query's
    # values behind in the global after every call.
    stats = property_stats.copy()
    stats[0][rooms_idx] = no_rooms
    stats[0][bedrms_idx] = no_bdrooms
    stats[0][age_idx] = house_age

    # The model was fitted on a DataFrame, so re-attach the training column
    # names before predicting; a bare array makes sklearn warn that the input
    # has no valid feature names.
    property_frame = pd.DataFrame(stats, columns=features.columns)
    log_estimate = regr.predict(property_frame)[0][0]

    # Width of the range, in multiples of the model's RMSE.
    if high_confidence:
        spread, interval = 2 * RMSE, 95
    else:
        spread, interval = RMSE, 68

    return log_estimate, log_estimate + spread, log_estimate - spread, interval

In [36]:
# Example query; the returned tuple is shown as the cell's output.
get_log_estimate(house_age=20, no_rooms=5, no_bdrooms=1, high_confidence=False)



(0.5457873332978842, 0.8996997205998898, 0.19187494599587868, 68)

# Price Scaling

In [28]:
# Scale the model's output to present-day prices, since prices have risen
# since the dataset was collected.
# Present-day median price: $600,000 as per ramseysolutions, written in the
# same units as the dataset target (multiples of $100,000).
median_price = 6.0
scale_factor = median_price / np.median(cali_dataset.target)
log_est, upper, lower, conf = get_log_estimate(20,5,1,high_confidence=False)

# Converting to today's dollars: undo the log transform, convert from target
# units to dollars, then apply the present-day scale factor.
dollars_per_unit = 100000
dollar_est = np.exp(log_est) * dollars_per_unit * scale_factor
dollar_hi = np.exp(upper) * dollars_per_unit * scale_factor
dollar_low = np.exp(lower) * dollars_per_unit * scale_factor

# Rounding the values to the nearest thousand dollars (decimals=-3).
rounded_est = int(np.around(dollar_est, -3))
rounded_hi = int(np.around(dollar_hi, -3))
rounded_low = int(np.around(dollar_low, -3))

# Report the point estimate and the valuation range.
print(f'The estimated property values is ${rounded_est}')
print(f'At {conf}% confidence the vauation range is:')
print(f' ${rounded_low} at the lowest to ${rounded_hi} at the highest')


The estimated property values is $576000
At 68% confidence the vauation range is:
 $405000 at the lowest to $821000 at the highest


