In [2]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, RobustScaler
from sklearn.linear_model import LinearRegression
from sklearn import metrics

In [3]:
data = pd.read_csv("/content/imports-85.data", header=None, na_values="?")
data.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,16,17,18,19,20,21,22,23,24,25
0,3,,alfa-romero,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111.0,5000.0,21,27,13495.0
1,3,,alfa-romero,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111.0,5000.0,21,27,16500.0
2,1,,alfa-romero,gas,std,two,hatchback,rwd,front,94.5,...,152,mpfi,2.68,3.47,9.0,154.0,5000.0,19,26,16500.0
3,2,164.0,audi,gas,std,four,sedan,fwd,front,99.8,...,109,mpfi,3.19,3.4,10.0,102.0,5500.0,24,30,13950.0
4,2,164.0,audi,gas,std,four,sedan,4wd,front,99.4,...,136,mpfi,3.19,3.4,8.0,115.0,5500.0,18,22,17450.0


In [4]:
data.loc[:, data.dtypes == "float64"] = data.loc[:, data.dtypes == "float64"].fillna(
    data.loc[:, data.dtypes == "float64"].mean(axis=0)
)
data.loc[:, data.dtypes == "object"] = data.loc[:, data.dtypes == "object"].fillna("")
data.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,16,17,18,19,20,21,22,23,24,25
0,3,122.0,alfa-romero,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111.0,5000.0,21,27,13495.0
1,3,122.0,alfa-romero,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111.0,5000.0,21,27,16500.0
2,1,122.0,alfa-romero,gas,std,two,hatchback,rwd,front,94.5,...,152,mpfi,2.68,3.47,9.0,154.0,5000.0,19,26,16500.0
3,2,164.0,audi,gas,std,four,sedan,fwd,front,99.8,...,109,mpfi,3.19,3.4,10.0,102.0,5500.0,24,30,13950.0
4,2,164.0,audi,gas,std,four,sedan,4wd,front,99.4,...,136,mpfi,3.19,3.4,8.0,115.0,5500.0,18,22,17450.0


In [5]:
data_encoded = pd.get_dummies(data)

In [6]:
data_encoded.columns = data_encoded.columns.astype(str)
X = data_encoded.drop(columns=["25"])
y = data_encoded["25"]

In [7]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=3)

In [9]:
scaler = RobustScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [10]:
class MyLinearRegression:
    def __init__(self, lr=0.001, num_iter=10000, loss='mse', delta=1.0) -> None:

        self.lr = lr
        self.num_iter = num_iter
        self.loss_type = loss
        self.delta = delta

    def calculate_loss(self, y, y_pred):
        if self.loss_type == 'mse':
            return ((y - y_pred) ** 2).mean()
        elif self.loss_type == 'huber':
            error = y - y_pred
            is_small_error = np.abs(error) <= self.delta

            loss = np.where(
                is_small_error,
                0.5 * error**2,
                self.delta * (np.abs(error) - 0.5 * self.delta)
            )
            return loss.mean()
        else:
            raise ValueError(f"Unknown loss function: {self.loss_type}")

    def calculate_dldw(self, y, y_pred, x):
        if self.loss_type == 'mse':
            error = y - y_pred
            grad_w = (-2 * (x.T * error).T).mean(axis=0)
            return grad_w

        elif self.loss_type == 'huber':
            error = y - y_pred
            gradient_per_point = np.where(
                np.abs(error) <= self.delta,
                -error,
                -self.delta * np.sign(error)
            )

            grad_w = (x.T * gradient_per_point).T.mean(axis=0)
            return grad_w

    def calculate_dldb(self, y, y_pred):
        if self.loss_type == 'mse':
            grad_b = (-2 * (y - y_pred)).mean()
            return grad_b

        elif self.loss_type == 'huber':
            error = y - y_pred

            gradient_per_point = np.where(
                np.abs(error) <= self.delta,
                -error,
                -self.delta * np.sign(error)
            )

            grad_b = gradient_per_point.mean()
            return grad_b

    def fit(self, X_train, y_train, verbose=False):
        X_train = np.array(X_train)
        y_train = np.array(y_train)

        self.weights = np.random.rand(X_train.shape[1])
        self.b = 0

        self.loss_history = []

        for i in range(self.num_iter):
            pred = (X_train @ self.weights) + self.b
            loss = self.calculate_loss(y_train, pred)
            self.loss_history.append(loss)

            grad_w = self.calculate_dldw(y_train, pred, X_train)
            grad_b = self.calculate_dldb(y_train, pred)

            self.weights -= self.lr * grad_w
            self.b -= self.lr * grad_b

            if verbose and i % 1000 == 0:
                print(f"Iteration {i}: Loss = {loss:.6f}")

    def predict(self, X_test):
        X_test = np.array(X_test)
        return X_test @ self.weights + self.b

    def get_params(self):
        return {
            'weights': self.weights,
            'bias': self.b,
            'loss': self.loss_type,
            'delta': self.delta if self.loss_type == 'huber' else None
        }

In [11]:
my_model = MyLinearRegression(loss = 'huber', delta = 10000)
my_model.fit(X_train, y_train)
preds = my_model.predict(X_test)
print(
    f"""
    MSE_test = {metrics.mean_squared_error(preds, y_test)},
    MAE_test = {metrics.mean_absolute_error(preds, y_test)}
    r2_test = {metrics.r2_score(y_test, preds)}
"""
)


    MSE_test = 4431010.8312986,
    MAE_test = 1688.5379091595498
    r2_test = 0.9441563663650552

