In [15]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler

# Load and preprocess the dataset
# Forward slashes resolve on Windows, Linux and macOS alike; the former
# '.\\Q2-data\\train.csv' spelling could only be opened on Windows.
dataset = pd.read_csv('./Q2-data/train.csv')
train_features = dataset[['bedrooms', 'living_in_m2', 'real_bathrooms']]
train_target = dataset[['price']]  # double brackets keep a 2-D column for the scaler

# Scale features and target variable separately
# (separate scalers so predictions can later be mapped back to price units via sc_Y)
sc_X = StandardScaler()
sc_Y = StandardScaler()
X_train = sc_X.fit_transform(train_features)
Y_train = sc_Y.fit_transform(train_target).flatten()  # Flatten to 1D for consistency

# Initialize weights and bias
rows, columns = X_train.shape
w = np.zeros(columns)  # 1D array: one weight per feature column
b = 0

# Gradient Descent function
def gradient_Descent(X_train, Y_train, w, b, learning_rate=0.01):
    """Perform one full-batch gradient-descent step for a linear model.

    The update follows the gradient of half the mean squared error,
    ``1/(2n) * sum((y - (w.x + b))**2)``, computed over all rows at once
    instead of accumulating it sample by sample.

    Parameters
    ----------
    X_train : numpy.ndarray, shape (n, m)
        Feature matrix, one sample per row.
    Y_train : array-like with n elements
        Target values; flattened to 1-D before use.
    w : numpy.ndarray, shape (m,)
        Current weights. Not modified in place.
    b : float
        Current bias.
    learning_rate : float, optional
        Step size. Defaults to 0.01, the value that used to be hard-coded.

    Returns
    -------
    tuple
        ``(w, b)`` after the update.

    Raises
    ------
    ZeroDivisionError
        If ``X_train`` has no rows.
    """
    n, _ = X_train.shape
    # Same expression as before: 1.0/n fails loudly on an empty batch.
    step = 1.0 / n * learning_rate

    # residuals[i] = y_i - y_hat_i for every sample
    residuals = np.ravel(Y_train) - (np.dot(X_train, w) + b)

    # Sum over samples of -residual * x_ij for each feature j, and of -residual for the bias
    w_gradient = -np.dot(X_train.T, residuals)
    b_gradient = -np.sum(residuals)

    # Update weights and bias
    return w - step * w_gradient, b - step * b_gradient

# Cost function
def cost(X_train, Y_train, w, b):
    """Return the mean squared error of the linear model ``X_train . w + b``.

    Parameters
    ----------
    X_train : numpy.ndarray, shape (n, m)
        Feature matrix, one sample per row.
    Y_train : array-like with n elements
        Target values; flattened to 1-D before use.
    w : numpy.ndarray, shape (m,)
        Model weights.
    b : float
        Model bias.

    Returns
    -------
    float
        ``sum((y - y_hat)**2) / n``.

    Raises
    ------
    ZeroDivisionError
        If ``X_train`` has no rows.
    """
    n, _ = X_train.shape
    if n == 0:
        # Keep the previous failure mode for an empty dataset rather than
        # letting numpy silently return NaN.
        raise ZeroDivisionError("cost is undefined for an empty dataset")

    # One matrix-vector product gives the prediction for every row at once.
    residuals = np.ravel(Y_train) - (np.dot(X_train, w) + b)
    return np.dot(residuals, residuals) / float(n)

# Training loop: 4000 full-batch updates, reporting the cost on every 200th one
for it in range(4000):
    w, b = gradient_Descent(X_train, Y_train, w, b)
    if it % 200 != 0:
        continue  # not a reporting iteration
    err = cost(X_train, Y_train, w, b)
    print("Cost at iteration", it, ":", err)


Cost at iteration 0 : 0.9878406373883164
Cost at iteration 200 : 0.6426556654278549
Cost at iteration 400 : 0.6223181537526674
Cost at iteration 600 : 0.6169359346592427
Cost at iteration 800 : 0.6154866562293687
Cost at iteration 1000 : 0.6150944426918851
Cost at iteration 1200 : 0.614988116245604
Cost at iteration 1400 : 0.6149592748959793
Cost at iteration 1600 : 0.6149514500275588
Cost at iteration 1800 : 0.6149493269383985
Cost at iteration 2000 : 0.6149487508759607
Cost at iteration 2200 : 0.6149485945704317
Cost at iteration 2400 : 0.6149485521592555
Cost at iteration 2600 : 0.6149485406516073
Cost at iteration 2800 : 0.6149485375291779
Cost at iteration 3000 : 0.6149485366819483
Cost at iteration 3200 : 0.614948536452065
Cost at iteration 3400 : 0.6149485363896877
Cost at iteration 3600 : 0.6149485363727655
Cost at iteration 3800 : 0.6149485363681729


In [16]:
# Load and preprocess test data
# Forward slashes resolve on Windows, Linux and macOS alike; the former
# '.\\Q2-data\\test.csv' spelling could only be opened on Windows.
test_set = pd.read_csv('./Q2-data/test.csv')
test_features = test_set[['bedrooms', 'living_in_m2', 'real_bathrooms']]
test_target = test_set[['price']]  # double brackets keep a 2-D column for the scaler

# Scale the test set with the scalers already fitted on the training data
# (transform only; the scalers are not re-fitted here)
X_test = sc_X.transform(test_features)
Y_test = sc_Y.transform(test_target).flatten()  # Flatten for consistency

# Function to predict values
def predict(X, w, b):
    """Return the linear-model output ``np.dot(X, w) + b``."""
    weighted_sum = np.dot(X, w)
    return weighted_sum + b

# Calculate accuracy metrics on test data
def evaluate_model(X_test, Y_test, w, b):
    """Print MAE, MSE, RMSE and R² of the linear model, measured in original target units.

    The metrics are computed with numpy. The previous body called
    ``mean_squared_error``, ``mean_absolute_error`` and ``r2_score``, which
    are not imported anywhere in the visible notebook and therefore fail
    with ``NameError`` on a fresh kernel.

    Parameters
    ----------
    X_test : numpy.ndarray, shape (n, m)
        Scaled feature matrix.
    Y_test : numpy.ndarray, shape (n,)
        Scaled targets (scaled with the module-level ``sc_Y``).
    w : numpy.ndarray, shape (m,)
        Trained weights.
    b : float
        Trained bias.

    Returns
    -------
    None
        Results are printed, not returned.
    """
    # Predict on test set (still in the scaled target space)
    Y_pred = predict(X_test, w, b)

    # Reverse scaling for Y_test and Y_pred; inverse_transform is given a 2-D column
    Y_test_original = sc_Y.inverse_transform(Y_test.reshape(-1, 1)).flatten()
    Y_pred_original = sc_Y.inverse_transform(Y_pred.reshape(-1, 1)).flatten()

    print("y-orignal : ", Y_test_original )

    print("y-predicted : ",Y_pred_original)

    # Calculate evaluation metrics
    residuals = Y_test_original - Y_pred_original
    squared_error_sum = np.sum(residuals ** 2)
    mse = np.mean(residuals ** 2)
    rmse = np.sqrt(mse)
    mae = np.mean(np.abs(residuals))

    # R² = 1 - SS_res / SS_tot
    total_sum_of_squares = np.sum((Y_test_original - np.mean(Y_test_original)) ** 2)
    if total_sum_of_squares == 0:
        # Constant target: avoid dividing by zero; a perfect fit scores 1.0, anything else 0.0
        r2 = 1.0 if squared_error_sum == 0 else 0.0
    else:
        r2 = 1.0 - squared_error_sum / total_sum_of_squares

    print("\n\nperformance of model on test data: ")
    print("Mean Absolute Error (MAE):", mae)
    print("Mean Squared Error (MSE):", mse)
    print("Root Mean Squared Error (RMSE):", rmse)
    print("R-squared (R²):", r2)

# Report test-set metrics for the trained weights and bias
evaluate_model(X_test, Y_test, w=w, b=b)

y-orignal :  [305000. 498000. 590000. ... 264000. 612125. 190000.]
y-predicted :  [262820.7349257  510911.03115942 651325.31442065 ... 415505.42576091
 457721.36593563 345489.67633752]


performance of model on test data: 
Mean Absolute Error (MAE): 130582.33789644581
Mean Squared Error (MSE): 26690569190.943676
Root Mean Squared Error (RMSE): 163372.48602792234
R-squared (R²): 0.3832662261647649
