In [20]:
#importing the libraries
import numpy as np 
from sklearn.model_selection import train_test_split
from sklearn.datasets import fetch_california_housing

In [21]:
class linear_regression:    #class for linear regression
    def __init__(self, X, y, reg, lamda): #initializing the class
        self.X = X 
        self.y = y  
        self.lamda = lamda
        self.reg = reg
        
    def splitData(self): #splitting the data into training and testing
        X_train, X_test, y_train, y_test = train_test_split(self.X, self.y, test_size=0.2, random_state=0)
        return X_train, X_test, y_train, y_test

    def add_X0(self, X): #adding the bias term to the input features
        return np.column_stack([np.ones([X.shape[0],1]),X])
    
    def normalize_data(self, X): #normalizing the input features
        mean = np.mean(X, axis=0)
        std = np.std(X, axis=0)
        X_normalized = (X - mean) / std
        X_normalized = self.add_X0(X_normalized)
        return X_normalized, mean, std

    def closed_form_solution(self, X, y, reg): #solving the linear regression using closed form solution
        if reg == False:
            return np.linalg.inv(X.T.dot(X)).dot(X.T).dot(y)
        else:
            return np.linalg.inv(X.T.dot(X) - (self.lamda) * np.eye(X.shape[1])).dot(X.T).dot(y)
    
    def predict(self, X): #predicting the output using the trained model
        return X.dot(self.w)

    def rmse(self, X, y): #calculating the root mean square error
        y_hat = self.predict(X)
        return np.sqrt(((y_hat - y) ** 2).mean())

    def sse(self, X, y): #calculating the sum of square error
        y_hat = self.predict(X)
        return ((y_hat - y) ** 2).sum()
        
    def fit(self): #fitting the model
        X_train, X_test, y_train, y_test = self.splitData()
        X_train_normalized, self.train_mean, self.train_std = self.normalize_data(X_train)
        X_test_normalized = (X_test - self.train_mean) / self.train_std
        X_test_normalized = self.add_X0(X_test_normalized)
        
        self.type_of_model = "Solving using closed-form " + ("Regularised" if self.reg else "Not-Regularised")
        print(self.type_of_model)
        
        self.w = self.closed_form_solution(X_train_normalized, y_train, self.reg)
        test_rmse = self.rmse(X_test_normalized, y_test)
        test_sse = self.sse(X_test_normalized, y_test)
        print("Root Mean Square Error: ", test_rmse)
        print("Sum of Square Error: ", test_sse)


In [15]:
# Fetch the data
cal_housing = fetch_california_housing()

# Create an instance of linear_regression class
lr = linear_regression(cal_housing.data, cal_housing.target, reg=False, lamda=0.0005)

# Train the model
lr.fit()

Solving using closed-form Not-Regularised
Root Mean Square Error:  0.7273129773603114
Sum of Square Error:  2183.646641527584


In [17]:
lr_reg = linear_regression(cal_housing.data, cal_housing.target, reg=True, lamda=0.0005)

# Train the model

lr_reg.fit()

Solving using closed-form Regularised
Root Mean Square Error:  0.7273129741935046
Sum of Square Error:  2183.646622511872


Validating the model using the Sklearn Library

In [18]:
import numpy as np
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline

# Loading the California Housing dataset
cal_housing = fetch_california_housing()

# Spliting the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(cal_housing.data, cal_housing.target, test_size=0.2, random_state=0)

# Creating a pipeline with feature scaling and linear regression
model = make_pipeline(StandardScaler(), LinearRegression())

# Training the model on the training data
model.fit(X_train, y_train)

# Evaluating the model on the testing data
test_rmse = np.sqrt(((model.predict(X_test) - y_test) ** 2).mean())
print("Root Mean Square Error (Test): ", test_rmse)

# Performing cross-validation to get a more robust estimate of model performance
cv_scores = cross_val_score(model, cal_housing.data, cal_housing.target, cv=5, scoring='neg_mean_squared_error')
cv_rmse = np.sqrt(-cv_scores.mean())
print("Root Mean Square Error (Cross-Validation): ", cv_rmse)


Root Mean Square Error (Test):  0.7273129773603114
Root Mean Square Error (Cross-Validation):  0.7471881769465138
