In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression, SGDRegressor
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.model_selection import GridSearchCV, train_test_split, cross_val_score
from sklearn.impute import SimpleImputer
import math

In [2]:
# Read the housing CSV from the sibling Dataset directory and preview the first rows.
housing_csv = "../Dataset/housing.csv"
df = pd.read_csv(housing_csv)
df.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY


In [3]:
# Count the missing values in each column of the raw frame.
raw_missing_counts = df.isna().sum()
raw_missing_counts

longitude               0
latitude                0
housing_median_age      0
total_rooms             0
total_bedrooms        207
population              0
households              0
median_income           0
median_house_value      0
ocean_proximity         0
dtype: int64

## Imputing the null values

In [4]:
# Fill missing values with the column mean (SimpleImputer's default strategy,
# spelled out here). `ocean_proximity` is dropped first — presumably because it
# is categorical and a mean cannot be computed for it.
# NOTE(review): the imputer is fitted on every row, target column included,
# before any train/test split happens — confirm this is acceptable for the
# evaluation that follows.
numeric_df = df.drop(columns=["ocean_proximity"])
imputer = SimpleImputer(strategy="mean").set_output(transform="pandas")
df2 = imputer.fit_transform(numeric_df)

df2.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0


In [5]:
# Confirm that imputation left no missing values in any column.
remaining_missing = df2.isna().sum()
remaining_missing

longitude             0
latitude              0
housing_median_age    0
total_rooms           0
total_bedrooms        0
population            0
households            0
median_income         0
median_house_value    0
dtype: int64

In [6]:
# Separate the regression target from the predictor columns.
data_y = df2["median_house_value"]
data_x = df2.drop(columns=["median_house_value"])

In [7]:
# Standardise every feature column and keep the result as a DataFrame.
# NOTE(review): the scaler is fitted on all rows of data_x, i.e. before the
# train/test split in the next cell — confirm this is intended.
scaler = StandardScaler().set_output(transform="pandas")
data_x = scaler.fit_transform(data_x)

In [8]:
# Hold out 30% of the rows for evaluation. A fixed random_state makes the
# shuffle — and therefore every metric computed from this split — reproducible
# across kernel restarts.
train_x, test_x, train_y, test_y = train_test_split(
    data_x, data_y, test_size=0.3, random_state=42
)

In [9]:
# Baseline: SGD-trained linear regressor with default hyperparameters.
# random_state pins the estimator's internal data shuffling so the fit is
# repeatable from run to run.
sgd_reg = SGDRegressor(random_state=42)
sgd_reg.fit(train_x, train_y)

In [10]:
def regsummry(y_true, y_pred):
    """Summarise regression quality with four metrics.

    Args:
        y_true: Observed target values.
        y_pred: Predicted target values, aligned with ``y_true``.

    Returns:
        dict with the keys ``"r2_score"``, ``"mae"``, ``"mse"`` and ``"rmse"``
        (in that order), where ``"rmse"`` is the square root of ``"mse"``.
    """
    # Compute the MSE once and reuse it for the RMSE instead of scoring twice.
    mse = mean_squared_error(y_true, y_pred)
    return {
        "r2_score": r2_score(y_true, y_pred),
        "mae": mean_absolute_error(y_true, y_pred),
        "mse": mse,
        "rmse": math.sqrt(mse),
    }

In [11]:
# Predict on the held-out rows and report the metric summary for this model.
y_pred = sgd_reg.predict(test_x)
regsummry(y_true=test_y, y_pred=y_pred)

{'r2_score': 0.6394057254057843,
 'mae': 50755.694612773834,
 'mse': 4843023277.463412,
 'rmse': 69591.83341070569}

### Tuning SGDRegressor hyperparameters: max_iter, n_iter_no_change, eta0, power_t

In [12]:
# Re-fit with early stopping, eta0=0.1, up to 5000 iterations and a patience
# of 30 iterations without improvement, then score on the held-out rows.
# random_state pins both the data shuffling and the validation hold-out that
# early stopping draws, so the reported metrics are reproducible.
sgd_reg = SGDRegressor(
    early_stopping=True,
    eta0=0.1,
    max_iter=5000,
    n_iter_no_change=30,
    random_state=42,
)
sgd_reg.fit(train_x, train_y)
y_pred = sgd_reg.predict(test_x)
regsummry(test_y, y_pred)

{'r2_score': 0.6366800982879979,
 'mae': 51794.19439294875,
 'mse': 4879630280.145247,
 'rmse': 69854.3504740059}