In [1]:
import numpy as np
import pandas as pd

import plotly.express as px
import plotly.graph_objects as go
import plotly.io as pio
pio.templates

import seaborn as sns
import matplotlib.pyplot as plt

%matplotlib inline

In [2]:
from sklearn.datasets import fetch_california_housing
cali = fetch_california_housing()
x = cali.data # the features
y = cali.target # the actual price

data = pd.DataFrame(x, columns = cali.feature_names) # getting all the features
data["SalesPrice"] = y # creating a column for sale price
data.head() # gets the start of the data

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,SalesPrice
0,8.3252,41.0,6.984127,1.02381,322.0,2.555556,37.88,-122.23,4.526
1,8.3014,21.0,6.238137,0.97188,2401.0,2.109842,37.86,-122.22,3.585
2,7.2574,52.0,8.288136,1.073446,496.0,2.80226,37.85,-122.24,3.521
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25,3.413
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25,3.422


In [3]:
print(cali.DESCR)

.. _california_housing_dataset:

California Housing dataset
--------------------------

**Data Set Characteristics:**

    :Number of Instances: 20640

    :Number of Attributes: 8 numeric, predictive attributes and the target

    :Attribute Information:
        - MedInc        median income in block group
        - HouseAge      median house age in block group
        - AveRooms      average number of rooms per household
        - AveBedrms     average number of bedrooms per household
        - Population    block group population
        - AveOccup      average number of household members
        - Latitude      block group latitude
        - Longitude     block group longitude

    :Missing Attribute Values: None

This dataset was obtained from the StatLib repository.
https://www.dcc.fc.up.pt/~ltorgo/Regression/cal_housing.html

The target variable is the median house value for California districts,
expressed in hundreds of thousands of dollars ($100,000).

This dataset was derived

In [4]:
print(data.shape)

(20640, 9)


In [5]:
data["SalesPrice"] = np.log1p(data["SalesPrice"])

In [6]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
x = data.drop("SalesPrice", axis = 1)
y = data["SalesPrice"]
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size= 0.2, random_state=42)
x_train = scaler.fit_transform(x_train)
x_test = scaler.fit_transform(x_test)


In [7]:
minChange = 0.00001
learning_rate = 0.001
strictness = 1
prev = -1
x_train_augmented = np.hstack([np.ones((x_train.shape[0], 1)), x_train])
thetas = np.zeros(x_train_augmented.shape[1])

current = np.mean((np.dot(x_train_augmented, thetas) - y_train) ** 2)

while abs(current - prev) > minChange:
  predictions = np.dot(x_train_augmented, thetas)
  errors = predictions - y_train
  gradient = (1/(y_train.shape[0])) * np.dot(x_train_augmented.T, errors) + (2 * strictness * thetas ** 2)
  thetas = thetas - (gradient * learning_rate)
  prev = current
  current = np.mean((np.dot(x_train_augmented, thetas) - y_train) ** 2)







In [14]:
from sklearn.metrics import mean_squared_error
x_test_augmented = np.hstack([np.ones((x_test.shape[0], 1)), x_test])


h_values = np.dot(x_test_augmented, thetas)
mse = mean_squared_error(y_test, h_values)
rsme = np.sqrt(mse)

print(mse)



0.3510277893775474


In [17]:
prediction = np.dot(x_test_augmented[2144], thetas)


print(x_test_augmented[2144])
print(y_test.loc[2144])

print(prediction)


[ 1.          1.1912917  -0.38063221  0.49313036 -0.24244125 -0.6693416
 -0.04366094 -0.72530336  1.16303662]
0.5440669575457926
0.7094559205829141
