In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler

# Load and preprocess the dataset.
# Forward slashes keep this relative path portable; a backslash-separated
# path would only resolve on Windows.
# NOTE(review): the file is named test.csv but is used as the training set
# below -- confirm this is the intended split.
dataset = pd.read_csv('./Q2-data/test.csv')
X_train = dataset[['bedrooms', 'living_in_m2', 'real_bathrooms']]
Y_train = dataset[['price']]  # double brackets keep a 2D (n, 1) frame for the scaler

# Scale features and target variable separately. sc_Y is kept around so that
# predictions can later be mapped back to original price units.
sc_X = StandardScaler()
sc_Y = StandardScaler()
X_train = sc_X.fit_transform(X_train)
Y_train = sc_Y.fit_transform(Y_train).flatten()  # Flatten to 1D for consistency

# Initialize weights and bias: one zero weight per feature column, zero bias
rows, columns = X_train.shape
w = np.zeros(columns)  # 1D array for weights
b = 0

# Gradient Descent function
def gradient_Descent(X_train, Y_train, w, b, learning_rate=0.01):
    """Perform one batch gradient-descent step for a linear model y_hat = X.w + b.

    The gradients are those of the half mean squared error, i.e.
    -(1/n) * sum(error_i * x_i) for the weights and -(1/n) * sum(error_i) for
    the bias, where error_i = y_i - y_hat_i. The factor of 2 from
    differentiating the squared error is not applied; it is effectively folded
    into the learning rate.

    Parameters
    ----------
    X_train : array-like of shape (n_samples, n_features)
        Feature matrix; must be 2D.
    Y_train : array-like with n_samples elements
        Target values; flattened to 1D before use.
    w : array-like of shape (n_features,)
        Current weights. Not modified in place.
    b : float
        Current bias.
    learning_rate : float, optional
        Step size applied to the averaged gradients (default 0.01).

    Returns
    -------
    (w, b) : tuple
        The updated weight vector and bias.

    Raises
    ------
    ZeroDivisionError
        If X_train contains no samples.
    """
    X = np.asarray(X_train, dtype=float)
    y = np.asarray(Y_train, dtype=float).ravel()
    w = np.asarray(w, dtype=float)
    n, _ = X.shape  # unpacking two values also enforces a 2D feature matrix

    # Residuals for every sample at once instead of a per-row Python loop
    residuals = y - (X @ w + b)

    # Summed (not yet averaged) gradients over the whole batch
    w_gradient = -(X.T @ residuals)
    b_gradient = -residuals.sum()

    # Average over the batch and scale by the learning rate in one factor
    step = 1.0 / n * learning_rate
    w = w - step * w_gradient
    b = b - step * b_gradient
    return w, b

# Cost function
def cost(X_train, Y_train, w, b):
    """Return the mean squared error of the linear model y_hat = X.w + b.

    Parameters
    ----------
    X_train : array-like of shape (n_samples, n_features)
        Feature matrix; must be 2D.
    Y_train : array-like with n_samples elements
        Target values; flattened to 1D before use.
    w : array-like of shape (n_features,)
        Model weights.
    b : float
        Model bias.

    Returns
    -------
    float
        sum((y_i - y_hat_i) ** 2) / n_samples.

    Raises
    ------
    ZeroDivisionError
        If X_train contains no samples (the mean is undefined).
    """
    X = np.asarray(X_train, dtype=float)
    y = np.asarray(Y_train, dtype=float).ravel()
    n, _ = X.shape  # unpacking two values also enforces a 2D feature matrix
    if n == 0:
        # Raised explicitly: numpy division would otherwise yield nan with only a warning
        raise ZeroDivisionError("cost is undefined for an empty dataset")

    # One vectorised pass: residual per sample, then the sum of their squares
    residuals = y - (X @ w + b)
    return np.dot(residuals, residuals) / float(n)

# Training loop: 4000 full-batch updates, reporting the cost on every 200th pass
for it in range(4000):
    w, b = gradient_Descent(X_train, Y_train, w, b)
    if it % 200:
        continue  # only iterations 0, 200, 400, ... are reported
    err = cost(X_train, Y_train, w, b)
    print("Cost at iteration", it, ":", err)


Cost at iteration 0 : 0.8900971975539541
Cost at iteration 200 : 0.6164917500899688
Cost at iteration 400 : 0.6164914835133501
Cost at iteration 600 : 0.6164914835126203
Cost at iteration 800 : 0.6164914835126181
Cost at iteration 1000 : 0.6164914835126175
Cost at iteration 1200 : 0.6164914835126177
Cost at iteration 1400 : 0.6164914835126177
Cost at iteration 1600 : 0.6164914835126177
Cost at iteration 1800 : 0.6164914835126177


KeyboardInterrupt: 

In [3]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# Function to predict values
def predict(X, w, b):
    """Return the linear-model output ``np.dot(X, w) + b`` for inputs ``X``.

    X is the feature data, w the weights and b the bias; the result is in
    whatever scale X and the model parameters are expressed in.
    """
    weighted_sum = np.dot(X, w)
    return weighted_sum + b

# Calculate accuracy metrics
def evaluate_model(X_train, Y_train, w, b):
    """Print regression metrics for the linear model (w, b) in unscaled target units.

    Predictions come from ``predict``; both the predictions and ``Y_train`` are
    mapped back through the module-level ``sc_Y`` scaler before scoring. The
    two unscaled arrays are printed first, followed by MAE, MSE, RMSE and R².
    Nothing is returned.
    """
    def to_original_units(scaled):
        # The scaler is given an (n, 1) column; flatten the result back to 1D
        return sc_Y.inverse_transform(scaled.reshape(-1, 1)).flatten()

    # Model output for the supplied samples, still in scaled units
    Y_pred = predict(X_train, w, b)

    actual = to_original_units(Y_train)
    predicted = to_original_units(Y_pred)

    print(actual)

    print(predicted)

    # Compute every metric up front, then emit them as label/value pairs
    mse = mean_squared_error(actual, predicted)
    report = [
        ("Mean Absolute Error (MAE):", mean_absolute_error(actual, predicted)),
        ("Mean Squared Error (MSE):", mse),
        ("Root Mean Squared Error (RMSE):", np.sqrt(mse)),
        ("R-squared (R²):", r2_score(actual, predicted)),
    ]
    for label, value in report:
        print(label, value)

# Evaluate the fitted w and b. The metrics are computed on the same
# X_train / Y_train used for fitting, so they describe fit quality on that
# data rather than performance on held-out samples.
evaluate_model(X_train, Y_train, w, b)


[305000. 498000. 590000. ... 264000. 612125. 190000.]
[262795.09308854 508345.40297034 655577.18194603 ... 413327.13065413
 465380.40206229 346779.75929249]
Mean Absolute Error (MAE): 130577.4963291162
Mean Squared Error (MSE): 26680083521.333256
Root Mean Squared Error (RMSE): 163340.3915794659
R-squared (R²): 0.38350851648625495
