## (1) Dataset

In [1]:
import numpy as np
import pandas as pd

In [2]:
train_df = pd.read_csv("california_housing_train.csv")
test_df = pd.read_csv("california_housing_test.csv")

In [3]:
train_df.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value
0,-114.31,34.19,15.0,5612.0,1283.0,1015.0,472.0,1.4936,66900.0
1,-114.47,34.4,19.0,7650.0,1901.0,1129.0,463.0,1.82,80100.0
2,-114.56,33.69,17.0,720.0,174.0,333.0,117.0,1.6509,85700.0
3,-114.57,33.64,14.0,1501.0,337.0,515.0,226.0,3.1917,73400.0
4,-114.57,33.57,20.0,1454.0,326.0,624.0,262.0,1.925,65500.0


In [4]:
test_df.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value
0,-122.05,37.37,27.0,3885.0,661.0,1537.0,606.0,6.6085,344700.0
1,-118.3,34.26,43.0,1510.0,310.0,809.0,277.0,3.599,176500.0
2,-117.81,33.78,27.0,3589.0,507.0,1484.0,495.0,5.7934,270500.0
3,-118.36,33.82,28.0,67.0,15.0,49.0,11.0,6.1359,330000.0
4,-119.67,36.33,19.0,1241.0,244.0,850.0,237.0,2.9375,81700.0


## (2) Tasks

### (A) Data preprocessing

#### Checking Null Values

In [5]:
## checking null values in train dataset
train_df.isnull().sum()

longitude             0
latitude              0
housing_median_age    0
total_rooms           0
total_bedrooms        0
population            0
households            0
median_income         0
median_house_value    0
dtype: int64

In [6]:
## checking null values in test dataset
test_df.isnull().sum()

## thus we dont have to deal with any null or missing values

longitude             0
latitude              0
housing_median_age    0
total_rooms           0
total_bedrooms        0
population            0
households            0
median_income         0
median_house_value    0
dtype: int64

#### Dividing the datasets into x_train, x_test, y_train, y_test

In [7]:
x_train = np.array(train_df.iloc[:, :-1])
y_train = np.array(train_df.iloc[:, -1]).reshape(-1,1)

In [8]:
x_train

array([[-114.31  ,   34.19  ,   15.    , ..., 1015.    ,  472.    ,
           1.4936],
       [-114.47  ,   34.4   ,   19.    , ..., 1129.    ,  463.    ,
           1.82  ],
       [-114.56  ,   33.69  ,   17.    , ...,  333.    ,  117.    ,
           1.6509],
       ...,
       [-124.3   ,   41.84  ,   17.    , ..., 1244.    ,  456.    ,
           3.0313],
       [-124.3   ,   41.8   ,   19.    , ..., 1298.    ,  478.    ,
           1.9797],
       [-124.35  ,   40.54  ,   52.    , ...,  806.    ,  270.    ,
           3.0147]])

In [9]:
y_train

array([[ 66900.],
       [ 80100.],
       [ 85700.],
       ...,
       [103600.],
       [ 85800.],
       [ 94600.]])

In [10]:
x_test = np.array(test_df.iloc[:, :-1])
y_test = np.array(test_df.iloc[:, -1]).reshape(-1, 1)

In [11]:
x_test

array([[-122.05  ,   37.37  ,   27.    , ..., 1537.    ,  606.    ,
           6.6085],
       [-118.3   ,   34.26  ,   43.    , ...,  809.    ,  277.    ,
           3.599 ],
       [-117.81  ,   33.78  ,   27.    , ..., 1484.    ,  495.    ,
           5.7934],
       ...,
       [-119.7   ,   36.3   ,   10.    , ...,  693.    ,  220.    ,
           2.2895],
       [-117.12  ,   34.1   ,   40.    , ...,   46.    ,   14.    ,
           3.2708],
       [-119.63  ,   34.42  ,   42.    , ...,  753.    ,  260.    ,
           8.5608]])

In [12]:
y_test

array([[344700.],
       [176500.],
       [270500.],
       ...,
       [ 62000.],
       [162500.],
       [500001.]])

#### Implementing Standardization from scratch

#### We have a mathematical formula for standardization
#### Z-Score = (Current_value - Mean) / Standard Deviation.

In [13]:
def Standardization(array): ## for the dependent variables
    rows, cols = array.shape  
    res = np.zeros((rows, cols))
    for col in range(cols):
        feature = array[:, col]  
        feature = (feature - np.mean(feature)) / np.std(feature)
        res[:, col] = feature   
    return res

In [14]:
# y_train = Standardization(y_train)
# y_test = Standardization(y_test)

In [15]:
x_test = Standardization(x_test)
x_train = Standardization(x_train)

In [16]:
x_train

array([[ 2.619365  , -0.67152023, -1.07967114, ..., -0.36118401,
        -0.07599796, -1.25254316],
       [ 2.53956878, -0.57326437, -0.76187201, ..., -0.26186523,
        -0.09940441, -1.08148298],
       [ 2.4946834 , -0.90546278, -0.92077158, ..., -0.95535424,
        -0.99925206, -1.17010515],
       ...,
       [-2.36291168,  2.90780067, -0.92077158, ..., -0.16167524,
        -0.11760942, -0.44666313],
       [-2.36291168,  2.88908527, -0.76187201, ..., -0.1146295 ,
        -0.06039367, -0.99778717],
       [-2.387848  ,  2.29955006,  1.85997083, ..., -0.54326844,
        -0.60134255, -0.45536288]])

### (B) Implementing Normal Equation

In [17]:
def NormalEquation(x_train, y_train):
    modified_x_train = np.c_[np.ones((x_train.shape[0], 1)), x_train] ## adding a column of ones (adding the intercept)
    transposed_modified_x_train = np.transpose(modified_x_train)

    params = np.linalg.inv((transposed_modified_x_train @ modified_x_train)) @ (transposed_modified_x_train @ y_train)
    return params

In [18]:
params = NormalEquation(x_train = x_train, y_train = y_train)

In [19]:
params

array([[207300.91235294],
       [-86499.60728226],
       [-91744.0508306 ],
       [ 14483.29860201],
       [-18263.60700749],
       [ 49587.33777166],
       [-44178.15153005],
       [ 17470.57604338],
       [ 77291.55314914]])

In [20]:
modified_x_test = np.c_[np.ones((x_test.shape[0], 1)), x_test]
y_pred = modified_x_test@params

In [21]:
y_pred.shape

(3000, 1)

In [22]:
def PerformaceMetrics(y_pred, y_test):
    mae = np.mean(np.abs(y_pred - y_test))
    mse = np.mean((y_pred - y_test) ** 2)

    print("The Mean Squared Error is: ", mse)
    print("The Mean Absolute Error is: ", mae)


In [23]:
y_pred

array([[358460.00878369],
       [216569.51471133],
       [276384.95549539],
       ...,
       [ 90664.64488211],
       [151950.61857716],
       [466849.37460678]])

In [24]:
y_test

array([[344700.],
       [176500.],
       [270500.],
       ...,
       [ 62000.],
       [162500.],
       [500001.]])

In [25]:
PerformaceMetrics(y_pred = y_pred, y_test = y_test)

The Mean Squared Error is:  4886020526.855478
The Mean Absolute Error is:  50669.4460909907


In [26]:
from sklearn.metrics import r2_score
print(r2_score(y_true=y_test, y_pred=y_pred))

0.6180349003715606


### (C) Implementing Batch Gradient Descent

### (D) Comparison with sklearn

In [27]:
from sklearn.linear_model import LinearRegression
model = LinearRegression()
model.fit(x_train, y_train)
sk_y_pred = model.predict(x_test)
print(r2_score(y_true=y_test, y_pred = sk_y_pred))

0.6180349003715608


### (E) Visualizations

### (F) Evaluation Metrics