In [1]:
from __future__ import division,print_function,unicode_literals
import numpy as np
import os
np.random.seed(42)

%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt
plt.rcParams['axes.labelsize'] = 14
plt.rcParams['xtick.labelsize'] = 12
plt.rcParams['ytick.labelsize'] = 12

In [2]:
import tarfile
from six.moves import urllib

DOWNLOAD_ROOT = "https://raw.githubusercontent.com/ageron/handson-ml/master/"
HOUSING_PATH = os.path.join("datasets", "housing")
HOUSING_URL = DOWNLOAD_ROOT + "datasets/housing/housing.tgz"

def fetch_housing_data(housing_url=HOUSING_URL, housing_path=HOUSING_PATH):
    if not os.path.isdir(housing_path):
        os.makedirs(housing_path)
    tgz_path = os.path.join(housing_path, "housing.tgz")
    urllib.request.urlretrieve(housing_url, tgz_path)
    housing_tgz = tarfile.open(tgz_path)
    housing_tgz.extractall(path=housing_path)
    housing_tgz.close()

In [3]:
fetch_housing_data()

In [4]:
import pandas as pd

def load_housing_data(housing_path=HOUSING_PATH):
    csv_path = os.path.join(housing_path, "housing.csv")
    return pd.read_csv(csv_path)

In [5]:
data = load_housing_data()

In [6]:
type(data)

pandas.core.frame.DataFrame

In [7]:
from sklearn.model_selection import train_test_split
train_set, test_set = train_test_split(data, test_size=0.2, random_state=42)

In [8]:
train_set_X=train_set.drop(["median_house_value","ocean_proximity"],axis=1)
train_set_Y=train_set['median_house_value']

In [9]:
from sklearn.preprocessing import Imputer
imputer = Imputer(strategy="median")

In [10]:
imputer.fit(train_set_X)

Imputer(axis=0, copy=True, missing_values='NaN', strategy=u'median',
    verbose=0)

In [11]:
X = imputer.transform(train_set_X) #gives a numpy array. Need to transform this to a pandan DataFrame

In [12]:
new_train_set_X=pd.DataFrame(X,columns=train_set_X.columns,index=list(train_set_X.index.values))

In [13]:
# new_train_set_X.head()

In [14]:
X=new_train_set_X.as_matrix()
Y_actutal=train_set_Y.as_matrix()
y=np.array(train_set_Y).flatten()

In [15]:
from sklearn import linear_model
clf = linear_model.Lasso(alpha = 0.1)
clf.fit(X,y)

Lasso(alpha=0.1, copy_X=True, fit_intercept=True, max_iter=1000,
   normalize=False, positive=False, precompute=False, random_state=None,
   selection='cyclic', tol=0.0001, warm_start=False)

In [16]:
print(clf.coef_)

[-4.26320206e+04 -4.24497252e+04  1.18281484e+03 -8.18800849e+00
  1.16259576e+02 -3.84922290e+01  4.63434529e+01  4.05384212e+04]


In [17]:
print (clf.intercept_)

-3578192.468399363


In [19]:
test_set_X=train_set.drop(["median_house_value","ocean_proximity"],axis=1)
test_set_Y=train_set['median_house_value']

In [20]:
from sklearn.preprocessing import Imputer
imputer = Imputer(strategy="median")
imputer.fit(test_set_X)
X = imputer.transform(test_set_X) #gives a numpy array. Need to transform this to a pandan DataFrame

In [None]:
new_test_set_X=pd.DataFrame(X,columns=test_set_X.columns,index=list(test_set_X.index.values))

In [None]:
# new_test_set_X.head()

In [None]:
X=new_test_set_X.as_matrix()
Y_actutal=test_set_Y.as_matrix()
y=np.array(test_set_Y).flatten()

In [21]:
y_pred =clf.predict(X)

In [22]:
y_pred

array([181313.20734562, 286451.83166873, 263327.28037206, ...,
       191338.60230213, 273177.79287179, 278432.69275394])

In [25]:
from sklearn.metrics import mean_squared_error
mse=mean_squared_error(y_pred,y)
rmse=np.sqrt(mse)
print ("Mean square error (test data): ")
print (mse)
print ("Root mean square error (test data):")
print (rmse)

Mean square error (test data): 
4811134397.953735
Root mean square error (test data):
69362.34135288208


Training on the entire data

In [26]:
y_df=data['median_house_value'] #y
X_df=data.drop(["median_house_value","ocean_proximity"],axis=1)

In [27]:
from sklearn.preprocessing import Imputer
imputer = Imputer(strategy="median")

In [28]:
imputer.fit(X_df)

Imputer(axis=0, copy=True, missing_values='NaN', strategy=u'median',
    verbose=0)

In [29]:
X = imputer.transform(X_df) #gives a numpy array. Need to transform this to a pandan DataFrame

In [30]:
new_X_df=pd.DataFrame(X,columns=X_df.columns,index=list(X_df.index.values))

In [31]:
# X=np.array(new_X_df)
X=new_X_df.as_matrix()
Y_actutal=y_df.as_matrix()
y=np.array(y_df).flatten()

In [32]:
clf.fit(X,y)

Lasso(alpha=0.1, copy_X=True, fit_intercept=True, max_iter=1000,
   normalize=False, positive=False, precompute=False, random_state=None,
   selection='cyclic', tol=0.0001, warm_start=False)

In [34]:
y_pred = clf.predict(X)

In [35]:
from sklearn.metrics import mean_squared_error
mse=mean_squared_error(y_pred,y)
rmse=np.sqrt(mse)
print ("Mean square error: ")
print (mse)
print ("Root mean square error")
print (rmse)

Mean square error: 
4852263483.710224
Root mean square error
69658.19035626912
