# Linear Regression
## DSMLAI Content Intern Assignment
### Dataset: Housing Prices Dataset (Kaggle)

### Objective
To implement Linear Regression manually without using ML libraries and understand how the algorithm works internally using gradient descent optimization.


In [44]:
import pandas as pd
import numpy as np

In [45]:
df = pd.read_csv("Housing.csv")
df.head()


Unnamed: 0,price,area,bedrooms,bathrooms,stories,mainroad,guestroom,basement,hotwaterheating,airconditioning,parking,prefarea,furnishingstatus
0,13300000,7420,4,2,3,yes,no,no,no,yes,2,yes,furnished
1,12250000,8960,4,4,4,yes,no,no,no,yes,3,no,furnished
2,12250000,9960,3,2,2,yes,no,yes,no,no,2,yes,semi-furnished
3,12215000,7500,4,2,2,yes,no,yes,no,yes,3,yes,furnished
4,11410000,7420,4,1,2,yes,yes,yes,no,yes,2,no,furnished


In [46]:
print("Shape of dataset:", df.shape)
print("\nData Types:\n")
print(df.dtypes)
print("\nMissing Values:\n")
print(df.isnull().sum())


Shape of dataset: (545, 13)

Data Types:

price                int64
area                 int64
bedrooms             int64
bathrooms            int64
stories              int64
mainroad            object
guestroom           object
basement            object
hotwaterheating     object
airconditioning     object
parking              int64
prefarea            object
furnishingstatus    object
dtype: object

Missing Values:

price               0
area                0
bedrooms            0
bathrooms           0
stories             0
mainroad            0
guestroom           0
basement            0
hotwaterheating     0
airconditioning     0
parking             0
prefarea            0
furnishingstatus    0
dtype: int64


# Part 1 – Data Loading & Preprocessing

In this section, we prepare the dataset for training.
This includes:
- Encoding categorical variables
- Splitting into training and testing sets
- Scaling features
- Preventing information leakage


In [47]:
def encode_categorical(df):
    """
    Converts categorical columns into numeric using one-hot encoding.
    drop_first=True avoids dummy variable trap.
    """
    df_encoded = pd.get_dummies(df, drop_first=True)
    return df_encoded


df_encoded = encode_categorical(df)
df_encoded.head()


Unnamed: 0,price,area,bedrooms,bathrooms,stories,parking,mainroad_yes,guestroom_yes,basement_yes,hotwaterheating_yes,airconditioning_yes,prefarea_yes,furnishingstatus_semi-furnished,furnishingstatus_unfurnished
0,13300000,7420,4,2,3,2,True,False,False,False,True,True,False,False
1,12250000,8960,4,4,4,3,True,False,False,False,True,False,False,False
2,12250000,9960,3,2,2,2,True,False,True,False,False,True,True,False
3,12215000,7500,4,2,2,3,True,False,True,False,True,True,False,False
4,11410000,7420,4,1,2,2,True,True,True,False,True,False,False,False


In [48]:
np.random.seed(42)

shuffled_indices = np.random.permutation(len(df_encoded))
test_size = int(len(df_encoded) * 0.2)

test_indices = shuffled_indices[:test_size]
train_indices = shuffled_indices[test_size:]

train_df = df_encoded.iloc[train_indices]
test_df = df_encoded.iloc[test_indices]

X_train = train_df.drop("price", axis=1).to_numpy()
y_train = train_df["price"].to_numpy()

X_test = test_df.drop("price", axis=1).to_numpy()
y_test = test_df["price"].to_numpy()

print("X_train shape:", X_train.shape)
print("y_train shape:", y_train.shape)


X_train shape: (436, 13)
y_train shape: (436,)


In [49]:
def encode_categorical(df):
    """
    Converts categorical columns into numeric using one-hot encoding.
    Ensures all columns are numeric.
    """
    df_encoded = pd.get_dummies(df, drop_first=True)

    # Force all columns to numeric type
    df_encoded = df_encoded.apply(pd.to_numeric)

    return df_encoded


df_encoded = encode_categorical(df)
df_encoded.dtypes


Unnamed: 0,0
price,int64
area,int64
bedrooms,int64
bathrooms,int64
stories,int64
parking,int64
mainroad_yes,bool
guestroom_yes,bool
basement_yes,bool
hotwaterheating_yes,bool


# Part 2 – Linear Model

In this section, we define the mathematical structure of Linear Regression.

The prediction formula is:

ŷ = XW + b

Where:
- X is the feature matrix
- W is the vector of weights (coefficients)
- b is the bias term
- ŷ is the predicted output


In [50]:
# Part 2 – Linear Model

def predict(X, weights, bias):
    return np.dot(X, weights) + bias


In [51]:
print(X_train.shape)
print(y_train.shape)


(436, 13)
(436,)


# Part 3 – Training Using Gradient Descent

We train the linear regression model by:

1. Defining the Mean Squared Error (MSE) loss function
2. Computing gradients of the loss with respect to weights and bias
3. Updating parameters iteratively to minimize the loss


In [52]:
def compute_mse(y_true, y_pred):
    return np.mean((y_true - y_pred) ** 2)


def train_linear_regression(X, y, learning_rate=0.000001, epochs=500):

    # Force numeric safety
    X = np.array(X, dtype=float)
    y = np.array(y, dtype=float)

    n_samples, n_features = X.shape

    weights = np.zeros(n_features)
    bias = 0.0

    for epoch in range(epochs):

        y_pred = np.dot(X, weights) + bias

        dw = (-2 / n_samples) * np.dot(X.T, (y - y_pred))
        db = (-2 / n_samples) * np.sum(y - y_pred)

        weights -= learning_rate * dw
        bias -= learning_rate * db

        if epoch % 100 == 0:
            loss = compute_mse(y, y_pred)
            print("Epoch:", epoch, "Loss:", loss)

    return weights, bias


In [53]:
weights, bias = train_linear_regression(X_train, y_train)

print("Training complete")


Epoch: 0 Loss: 25234792406487.613
Epoch: 100 Loss: inf
Epoch: 200 Loss: nan
Epoch: 300 Loss: nan
Epoch: 400 Loss: nan
Training complete


  return np.mean((y_true - y_pred) ** 2)
  weights -= learning_rate * dw


In [54]:
print(X_train.shape)


(436, 13)


In [60]:
def compute_mse(y_true, y_pred):
    return np.mean((y_true - y_pred) ** 2)


def train_linear_regression(X, y, learning_rate=0.01, epochs=1000):

    X = np.array(X, dtype=float)
    y = np.array(y, dtype=float)

    n_samples, n_features = X.shape

    weights = np.zeros(n_features)
    bias = 0.0

    for epoch in range(epochs):

        y_pred = np.dot(X, weights) + bias

        dw = (-2 / n_samples) * np.dot(X.T, (y - y_pred))
        db = (-2 / n_samples) * np.sum(y - y_pred)

        weights -= learning_rate * dw
        bias -= learning_rate * db

        if epoch % 100 == 0:
            loss = compute_mse(y, y_pred)
            print(f"Epoch {epoch}, Loss: {loss}")

    return weights, bias


In [61]:
weights, bias = train_linear_regression(X_train, y_train)

print("Training complete")


Epoch 0, Loss: 25234792406487.613
Epoch 100, Loss: nan
Epoch 200, Loss: nan
Epoch 300, Loss: nan
Epoch 400, Loss: nan
Epoch 500, Loss: nan
Epoch 600, Loss: nan
Epoch 700, Loss: nan
Epoch 800, Loss: nan
Epoch 900, Loss: nan
Training complete


  weights -= learning_rate * dw


# Part 4 – Model Evaluation

After training the model, we evaluate its performance on unseen test data using Mean Squared Error (MSE).

This helps us measure how well the model generalizes.


In [62]:
def evaluate_model(X, y, weights, bias):
    y_pred = np.dot(X, weights) + bias
    mse = np.mean((y - y_pred) ** 2)
    return mse


test_mse = evaluate_model(X_test, y_test, weights, bias)

print("Test MSE:", test_mse)


Test MSE: nan


In [63]:
# Compare first 5 predictions
y_pred_sample = np.dot(X_test[:5], weights) + bias

print("Predicted:", y_pred_sample)
print("Actual:", y_test[:5])


Predicted: [nan nan nan nan nan]
Actual: [4060000 6650000 3710000 6440000 2800000]
