In [214]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
%matplotlib inline
pd.set_option('display.float_format', lambda x: '%.3f' % x)

In [215]:
df = pd.read_csv('housing.csv')

In [216]:
df.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.325,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.301,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.257,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.643,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.846,342200.0,NEAR BAY


In [217]:
df['ocean_proximity'].unique()

array(['NEAR BAY', '<1H OCEAN', 'INLAND', 'NEAR OCEAN', 'ISLAND'],
      dtype=object)

In [218]:
df = pd.get_dummies(df, columns=['ocean_proximity'])

In [219]:
df = df.dropna()

In [220]:
df.columns

Index(['longitude', 'latitude', 'housing_median_age', 'total_rooms',
       'total_bedrooms', 'population', 'households', 'median_income',
       'median_house_value', 'ocean_proximity_<1H OCEAN',
       'ocean_proximity_INLAND', 'ocean_proximity_ISLAND',
       'ocean_proximity_NEAR BAY', 'ocean_proximity_NEAR OCEAN'],
      dtype='object')

In [221]:
scaler = MinMaxScaler()

df[['longitude', 'latitude', 'housing_median_age', 'total_rooms',
       'total_bedrooms', 'population', 'households', 'median_income',
       'median_house_value', 'ocean_proximity_<1H OCEAN',
       'ocean_proximity_INLAND', 'ocean_proximity_ISLAND',
       'ocean_proximity_NEAR BAY', 'ocean_proximity_NEAR OCEAN']] = scaler.fit_transform(
    df[['longitude', 'latitude', 'housing_median_age', 'total_rooms',
       'total_bedrooms', 'population', 'households', 'median_income',
       'median_house_value', 'ocean_proximity_<1H OCEAN',
       'ocean_proximity_INLAND', 'ocean_proximity_ISLAND',
       'ocean_proximity_NEAR BAY', 'ocean_proximity_NEAR OCEAN']])

In [222]:
df.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity_<1H OCEAN,ocean_proximity_INLAND,ocean_proximity_ISLAND,ocean_proximity_NEAR BAY,ocean_proximity_NEAR OCEAN
0,0.211,0.567,0.784,0.022,0.02,0.009,0.021,0.54,0.902,0.0,0.0,0.0,1.0,0.0
1,0.212,0.565,0.392,0.181,0.171,0.067,0.187,0.538,0.708,0.0,0.0,0.0,1.0,0.0
2,0.21,0.564,1.0,0.037,0.029,0.014,0.029,0.466,0.695,0.0,0.0,0.0,1.0,0.0
3,0.209,0.564,1.0,0.032,0.036,0.016,0.036,0.355,0.673,0.0,0.0,0.0,1.0,0.0
4,0.209,0.564,1.0,0.041,0.043,0.016,0.042,0.231,0.675,0.0,0.0,0.0,1.0,0.0


In [223]:
X = df[['longitude', 'latitude', 'housing_median_age', 'total_rooms',
       'total_bedrooms', 'population', 'households', 'median_income','ocean_proximity_<1H OCEAN',
       'ocean_proximity_INLAND', 'ocean_proximity_ISLAND',
       'ocean_proximity_NEAR BAY', 'ocean_proximity_NEAR OCEAN']]
y= df[['median_house_value']]

In [224]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.25, random_state=1) # 0.25 x 0.8 = 0.2

In [225]:
from sklearn.linear_model import LinearRegression

regressor = LinearRegression()
regressor.fit(X_train, y_train)

LinearRegression()

In [226]:
coeff_df = pd.DataFrame(np.transpose(regressor.coef_), X.columns, columns=['Coefficient'])
coeff_df
#np.transpose(regressor.coef_).shape

Unnamed: 0,Coefficient
longitude,-0.57
latitude,-0.516
housing_median_age,0.112
total_rooms,-0.354
total_bedrooms,1.168
population,-3.369
households,0.901
median_income,1.167
ocean_proximity_<1H OCEAN,1394243721361.274
ocean_proximity_INLAND,1394243721361.197


In [227]:
y_pred = regressor.predict(X_val)

In [228]:
y_pred

array([[0.32763672],
       [0.23901367],
       [0.31396484],
       ...,
       [0.45117188],
       [0.16235352],
       [0.48413086]])

In [229]:
pd.DataFrame(np.hstack((y_val.to_numpy(), y_pred.reshape(-1, 1))))

Unnamed: 0,0,1
0,0.179,0.328
1,0.175,0.239
2,0.232,0.314
3,0.299,0.350
4,0.158,0.306
...,...,...
4082,0.162,0.184
4083,0.184,0.347
4084,0.684,0.451
4085,0.259,0.162


In [230]:
from sklearn import metrics
print('Mean Absolute Error:', metrics.mean_absolute_error(y_val, y_pred))
print('Mean Squared Error:', metrics.mean_squared_error(y_val, y_pred))
print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_val, y_pred)))

Mean Absolute Error: 0.10572246480043591
Mean Squared Error: 0.022465777030762482
Root Mean Squared Error: 0.1498858800246457


In [231]:
regressor.score(X_val,y_val)

0.6147060255712351