In [14]:
import numpy as np
import pandas as pd

In [15]:
from sklearn.model_selection import train_test_split
from sklearn.base import TransformerMixin
from sklearn.preprocessing import StandardScaler
from typing import Optional, List

In [16]:
data = pd.read_csv('./data.csv')

In [17]:
seed = 24
target_column = "Sale_Price"
np.random.seed(seed)

test_size = 0.2
data_train, data_test, Y_train, Y_test = train_test_split(
    data[data.columns.drop("Sale_Price")],
    np.array(data["Sale_Price"]),
    test_size=test_size,
    random_state=seed)
print(f"Train : {data_train.shape} {Y_train.shape}")
print(f"Test : {data_test.shape} {Y_test.shape}")

Train : (2344, 80) (2344,)
Test : (586, 80) (586,)


In [18]:
continuous_columns = [key for key in data.keys() if data[key].dtype in ("int64", "float64")]
categorical_columns = [key for key in data.keys() if data[key].dtype == "object"]
continuous_columns.remove(target_column)


In [19]:
class BaseDataPreprocessor(TransformerMixin):
    def __init__(self, needed_columns: Optional[List[str]]=None):
        """
        :param needed_columns: if not None select these columns from the dataframe
        """
        self.scaler = StandardScaler()
        if needed_columns:
            self.needed_columns = needed_columns
        else:
            self.needed_columns = None

    def fit(self, data, *args):
        """
        Prepares the class for future transformations
        :param data: pd.DataFrame with all available columns
        :return: self
        """
        if self.needed_columns:
            data = data[self.needed_columns]
        self.scaler.fit(data)
        return self

    def transform(self, data: pd.DataFrame) -> np.array:
        """
        Transforms features so that they can be fed into the regressors
        :param data: pd.DataFrame with all available columns
        :return: np.array with preprocessed features
        """
        if self.needed_columns:
            data = data[self.needed_columns]
        data = self.scaler.transform(data)
        return np.array(data)

In [20]:
preprocessor = BaseDataPreprocessor(needed_columns=continuous_columns)
X_train = preprocessor.fit_transform(data_train)
X_test = preprocessor.transform(data_test)

In [102]:
X_train.shape

(2344, 34)

In [99]:
from sklearn.base import RegressorMixin

In [137]:
class SGDLinearRegressor(RegressorMixin):
    def __init__(self,
                 lr=0.01, regularization=1., delta_converged=1e-3, max_steps=1000,
                 batch_size=64):
        self.lr = lr
        self.regularization = regularization
        self.max_steps = max_steps
        self.delta_converged = delta_converged
        self.batch_size = batch_size

        self.W = None
        self.b = None

    def fit(self, X, Y):
        self.X_shape = X.shape
        # create random parameters
        self.W = np.random.normal(size = self.X_shape[1])
        self.W_pred = self.W
        self.b = np.random.normal()
        for i in range(self.max_steps):
            self.curr_norm_W = np.linalg.norm(self.W)
            for j in range(X.shape[0]//self.batch_size):
                ## Create batch_X and batch_Y
                batch_X = X[j*self.batch_size:(j+1)*self.batch_size]
                batch_Y = Y[j*self.batch_size:(j+1)*self.batch_size]
                ## predictions
                f = batch_X.dot(self.W)+self.b
                ## error
                err = f - batch_Y
                ## gradient
                grad_W = 2/self.batch_size*batch_X.T.dot(err) + 2*self.regularization*self.W
                grad_b = 2/self.batch_size*err.sum()
                # change parameters
                self.W = self.W - self.lr*grad_W
                self.b = self.b - self.lr*grad_b
                # concatenate and shuffle
                X_Y = np.column_stack([X,Y])
                np.random.shuffle(X_Y)
                # reconstruction X and Y after shuffle
                X = X_Y[:,:self.X_shape[1]]
                Y = X_Y[:,self.X_shape[1]]
            # check stop condition
            if np.linalg.norm(self.W-self.W_pred)<self.delta_converged:
                break
    def predict(self, X):
        return X.dot(self.W)+self.b

In [138]:
model = SGDLinearRegressor(max_steps=10)
model.fit(X_train, Y_train)

prediction = model.predict(X_test)
print(Y_test.shape, prediction.shape)

(586,) (586,)


In [121]:
from sklearn.metrics import mean_absolute_error
def root_mean_squared_logarithmic_error(y_true, y_pred, a_min=1.):
    y_pred = (y_pred>a_min)*y_pred+(y_pred<a_min)*a_min
    return (((np.log(y_true)-np.log(y_pred))**2).sum()/y_true.shape[0])**0.5

In [136]:
print("MAE : ", mean_absolute_error(Y_test, prediction))
print("Mean log : ", root_mean_squared_logarithmic_error(Y_test, prediction))

MAE :  25514.25867272621
Mean log :  0.18684930212416273


In [123]:
from sklearn.linear_model import LinearRegression

In [125]:
model = LinearRegression()
model.fit(X_train, Y_train)

prediction = model.predict(X_test)
print(Y_test.shape, prediction.shape)
print("MAE : ", mean_absolute_error(Y_test, prediction))
print("Mean log : ", root_mean_squared_logarithmic_error(Y_test, prediction))

(586,) (586,)
MAE :  23825.06601165691
Mean log :  0.1951128267485825
