In [1]:
# This notebook builds a linear regression model for predicting house values in California.
# The model is trained on 1990 U.S. Census data, which is available through sklearn.
import pandas as pd
import numpy as np
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import RidgeCV
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import cross_validate
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import StratifiedShuffleSplit
import os, ssl

In [2]:
# Get the California housing data from sklearn datasets.
# It has data about 20640 housing blocks in California as per the 1990 U.S. Census.
# It contains the various parameters that describe the houses and also the house values.
#
# Workaround for environments where the HTTPS download fails certificate checks:
# unless PYTHONHTTPSVERIFY is set, urllib's default HTTPS context is switched to an
# unverified one. SECURITY NOTE: this disables certificate verification, so the
# override is limited to the download below and the previous context is restored
# afterwards instead of staying disabled for the rest of the kernel session.
_use_unverified_https = bool(not os.environ.get('PYTHONHTTPSVERIFY', '') and
                             getattr(ssl, '_create_unverified_context', None))
_previous_https_context = getattr(ssl, '_create_default_https_context', None)
if _use_unverified_https:
    ssl._create_default_https_context = ssl._create_unverified_context
try:
    d_house_data = fetch_california_housing(data_home=None, download_if_missing=True, return_X_y=False)
finally:
    # Restore the original context even if the download raised.
    if _use_unverified_https and _previous_https_context is not None:
        ssl._create_default_https_context = _previous_https_context

In [3]:
# Wrap the raw arrays in pandas DataFrames for the rest of the analysis.
#   d_house       -> one column per house-block parameter (named after feature_names)
#   d_house_value -> a single 'HouseValue' column holding the corresponding house values
d_house = pd.DataFrame(d_house_data.data, columns=d_house_data.feature_names)
d_house_value = pd.DataFrame({'HouseValue': d_house_data.target})

In [4]:
# List the house parameters (features) available in the dataset:
# MedInc - Median income for households within a block of houses
# HouseAge - Age of a house within a block; a lower number is a newer building
# AveRooms - Average number of rooms within a block for a house
# AveBedrms - Average number of bedrooms within a block for a house
# Population - Total number of people residing within a block
# AveOccup - Average number of people residing within a home unit, for a block
# Latitude - A measure of how far north a house is; a higher value is farther north
# Longitude - A measure of how far west a house is; a more negative value is farther west
d_house_data.feature_names

['MedInc',
 'HouseAge',
 'AveRooms',
 'AveBedrms',
 'Population',
 'AveOccup',
 'Latitude',
 'Longitude']

In [5]:
# HouseValue - House value for a given block (the regression target).
# NOTE: per the scikit-learn dataset description the target is expressed in units
# of $100,000 - confirm against the docs before reporting dollar amounts.
# Preview the first five rows.
d_house_value.head()

Unnamed: 0,HouseValue
0,4.526
1,3.585
2,3.521
3,3.413
4,3.422


In [6]:
# Summarize the house value data (count, mean, spread, quartiles).
# The count should equal 20640, one row per house block, which confirms that
# HouseValue has no missing values.
d_house_value.describe()

Unnamed: 0,HouseValue
count,20640.0
mean,2.068558
std,1.153956
min,0.14999
25%,1.196
50%,1.797
75%,2.64725
max,5.00001


In [7]:
# Check the dtype and non-null count of HouseValue.
# All 20640 values are non-null, so no imputation is needed for the target.
d_house_value.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639
Data columns (total 1 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   HouseValue  20640 non-null  float64
dtypes: float64(1)
memory usage: 161.4 KB


In [8]:
# Inspect the house block parameters: their min, max and how each one is spread.
# Notice that the mean of MedInc is 3.87 but its max goes all the way up to 15,
# i.e. the distribution has a long upper tail.
# This is used later when picking representative train/test samples for the model.
d_house.describe()

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude
count,20640.0,20640.0,20640.0,20640.0,20640.0,20640.0,20640.0,20640.0
mean,3.870671,28.639486,5.429,1.096675,1425.476744,3.070655,35.631861,-119.569704
std,1.899822,12.585558,2.474173,0.473911,1132.462122,10.38605,2.135952,2.003532
min,0.4999,1.0,0.846154,0.333333,3.0,0.692308,32.54,-124.35
25%,2.5634,18.0,4.440716,1.006079,787.0,2.429741,33.93,-121.8
50%,3.5348,29.0,5.229129,1.04878,1166.0,2.818116,34.26,-118.49
75%,4.74325,37.0,6.052381,1.099526,1725.0,3.282261,37.71,-118.01
max,15.0001,52.0,141.909091,34.066667,35682.0,1243.333333,41.95,-114.31


In [9]:
# Check the dtype and non-null count of every house block parameter.
# All columns are non-null, so there is no missing data to handle.
d_house.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639
Data columns (total 8 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   MedInc      20640 non-null  float64
 1   HouseAge    20640 non-null  float64
 2   AveRooms    20640 non-null  float64
 3   AveBedrms   20640 non-null  float64
 4   Population  20640 non-null  float64
 5   AveOccup    20640 non-null  float64
 6   Latitude    20640 non-null  float64
 7   Longitude   20640 non-null  float64
dtypes: float64(8)
memory usage: 1.3 MB


In [10]:
# Place the parameters and the target side by side in one frame so the correlation
# of every parameter with HouseValue (the dependent variable) can be inspected.
combined_house_data = pd.concat(
    objs=[d_house, d_house_value],
    axis="columns",
    join="inner",
)

In [11]:
# Pairwise correlations, then the HouseValue column ranked from strongest positive down.
# HouseValue trivially correlates with itself (1.0); MedInc has the next highest value.
corr_data = combined_house_data.corr()
house_value_corr = corr_data["HouseValue"].sort_values(ascending=False)
print(house_value_corr)

HouseValue    1.000000
MedInc        0.688075
AveRooms      0.151948
HouseAge      0.105623
AveOccup     -0.023737
Population   -0.024650
Longitude    -0.045967
AveBedrms    -0.046701
Latitude     -0.144160
Name: HouseValue, dtype: float64


In [12]:
# Drop independent parameters that are not expected to add value to the model.
# AveOccup has the weakest linear correlation with HouseValue (about -0.02), so it is removed.
# errors='ignore' keeps this cell idempotent: because it reassigns combined_house_data,
# re-running it would otherwise raise KeyError once the column is already gone.
combined_house_data = combined_house_data.drop(columns=['AveOccup'], errors='ignore')

In [13]:
# MedInc has the highest correlation with HouseValue, so the train and test samples
# should both represent the full spread of MedInc in the right proportions.
# StratifiedShuffleSplit needs a categorical label to do that, so bucket MedInc into
# five income slabs: (0, 1.5], (1.5, 3], (3, 4.5], (4.5, 6] and (6, inf].
med_inc_bin_edges = [0., 1.5, 3.0, 4.5, 6., np.inf]
med_inc_bin_labels = [1, 2, 3, 4, 5]
combined_house_data['MedIncCat'] = pd.cut(
    combined_house_data['MedInc'],
    bins=med_inc_bin_edges,
    labels=med_inc_bin_labels,
)

In [14]:
# Perform a StratifiedShuffleSplit so that the train (80%) and test (20%) samples both
# represent the whole MedInc spectrum. random_state is fixed for reproducibility.
split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=1)
# n_splits=1, so the generator yields exactly one (train, test) pair of index arrays.
train_idx, test_idx = next(split.split(combined_house_data, combined_house_data["MedIncCat"]))
# split() returns positional indices, so select rows with .iloc (label-based .loc is only
# equivalent while the frame has a default RangeIndex).
# MedIncCat was only needed for stratification; drop it without inplace=True so that no
# slice of combined_house_data is mutated (avoids SettingWithCopyWarning).
stratified_train_data = combined_house_data.iloc[train_idx].drop(columns='MedIncCat')
stratified_test_data = combined_house_data.iloc[test_idx].drop(columns='MedIncCat')

In [15]:
# The stratified frames still hold both the independent parameters and the dependent
# HouseValue column. Separate them, since fitting and testing the model need both parts.
def _split_features_target(frame, target_column='HouseValue'):
    """Return (features, target): the frame without target_column, and a copy of that column."""
    return frame.drop(columns=target_column), frame[target_column].copy()


stratified_train, stratified_train_target = _split_features_target(stratified_train_data)
stratified_test, stratified_test_target = _split_features_target(stratified_test_data)

In [16]:
# Create the ordinary least-squares regression model and fit it on the training split.
# fit() returns the estimator itself, so construction and fitting can be chained.
regModel = LinearRegression().fit(X=stratified_train, y=stratified_train_target)
# Leave the fitted estimator as the cell's last expression so it is displayed.
regModel

LinearRegression()

In [17]:
# Observe the score of the regression model on the data it was trained on.
# For LinearRegression, score() returns the coefficient of determination (R^2).
# The exact value may vary with the train/test split.
regModel.score(stratified_train,stratified_train_target)

0.6036419398447251

In [18]:
# Predict house values for the held-out test split, then report how well the
# predictions match the true values using the R2 score (coefficient of determination).
predict_set = regModel.predict(stratified_test)
test_r2 = r2_score(stratified_test_target, predict_set)
print("Coefficient based on test data: %.2f" % test_r2)

Coefficient based on test data: 0.60


In [19]:
# We have successfully created a regression model to predict house values in California based on the 1990 U.S. Census