In [1]:
from typing import List, Tuple

In [2]:
import math
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import pickle

from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

In [3]:
# Column names used throughout the notebook: the regression target and the
# row identifier that is written to the submission file.
y_col, id_col = "SalePrice", "Id"

# Raw training and test tables, read from the local dataset directory.
train_data = pd.read_csv("dataset/train.csv")
test_data = pd.read_csv("dataset/test.csv")

In [4]:
def random_split_on_data(data: pd.DataFrame, train_size: float = 0.8, random_state=None) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """
    Randomly partition the rows of `data` into a train frame and a test frame.

    Every row is assigned independently: it lands in the train frame when a
    uniform draw from [0, 1) is below `train_size`, otherwise in the test
    frame. The realised proportion is therefore only approximately
    `train_size`.

    Parameters
    ----------
    data : pd.DataFrame
        Frame to split. It is not modified.
    train_size : float
        Threshold compared against each row's uniform draw.
    random_state : optional
        Seed (or any value accepted by `np.random.default_rng`) that makes the
        split reproducible. When None (the default) the draws come from
        NumPy's global random state, exactly as before this parameter existed.

    Returns
    -------
    Tuple[pd.DataFrame, pd.DataFrame]
        `(train, test)`; together they contain every row of `data` exactly once.
    """
    n_rows = len(data)
    if random_state is None:
        # Legacy path: honours any np.random.seed(...) set by the caller.
        draws = np.random.rand(n_rows)
    else:
        # Dedicated generator so the split does not depend on global state.
        draws = np.random.default_rng(random_state).random(n_rows)
    split = draws < train_size
    train = data[split]
    test = data[~split]
    return train, test

In [5]:
def split_dataset_supervised(data: pd.DataFrame, train_cols: List[str], y_col: str) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """
    Separate a frame into model inputs (X) and the target (Y).

    The selection is made on a deep copy of `data`, so the frame passed in is
    left untouched. X holds the `train_cols` columns; Y is a single-column
    DataFrame (not a Series) holding `y_col`.
    """
    snapshot = data.copy(deep=True)
    target_cols = [y_col]
    return snapshot[train_cols], snapshot[target_cols]

In [6]:
def calculate_rmse(lr_model: LinearRegression, X: pd.DataFrame, Y: pd.DataFrame) -> float:
    """
    Calculates the RMSE of the model's predictions for X against the true Y.

    Parameters
    ----------
    lr_model : fitted model exposing `predict(X)`.
    X : feature frame passed to `predict`.
    Y : true target values, compared against the predictions.

    Returns
    -------
    float
        Root mean squared error. With several target columns this is the
        plain average of the per-column RMSE values.
    """
    Y_pred = lr_model.predict(X)
    # `mean_squared_error(..., squared=False)` was deprecated in scikit-learn 1.4
    # and removed in 1.6. Taking the square root of the per-output MSE and then
    # averaging reproduces the value it used to return, on every version.
    per_output_mse = mean_squared_error(Y, Y_pred, multioutput="raw_values")
    return float(np.mean(np.sqrt(per_output_mse)))

In [7]:
def save_model(model, model_name):
    """
    Pickle `model` to the file path `model_name`.

    Any missing parent directory of `model_name` is created first, so saving
    into a folder that does not exist yet no longer raises FileNotFoundError.
    An existing file at that path is overwritten.
    """
    import os  # local import so this helper does not depend on another cell

    parent_dir = os.path.dirname(model_name)
    if parent_dir:  # empty when `model_name` is a bare file name
        os.makedirs(parent_dir, exist_ok=True)
    with open(model_name, "wb") as mw:
        pickle.dump(model, mw)

In [8]:
def get_submission_file(model, data, y_col, id_col, pred_cols, output_file):
    """
    Predict `y_col` for every row of `data` and write the result as a CSV.

    The model is fed the `pred_cols` columns. The file written to
    `output_file` contains only `id_col` and the predicted `y_col`, without
    the index. The work is done on a deep copy, so `data` is not modified.
    """
    frame = data.copy(deep=True)
    predictions = model.predict(frame[pred_cols])
    frame[y_col] = predictions
    frame[[id_col, y_col]].to_csv(output_file, index=False)

## Training with more numerical features

In [28]:
# Numerical predictor columns used in this experiment.
features = [
    "LotArea",
    "LotFrontage",
    "OverallQual",
    "OverallCond",
    "MasVnrArea",
    "1stFlrSF",
    "2ndFlrSF",
    "GarageArea",
]
# Columns kept from the raw training table: the predictors plus the target.
cols = [*features, y_col]
# Proportion handed to the random train/test split.
train_split = 0.7

In [29]:
# Select the modelling columns first so that only those columns are copied,
# not the whole raw table.
training = train_data[cols].copy(deep=True)
print(f"Before fill NA: {training.shape}, missing values: {int(training.isna().sum().sum())}")
# Missing entries are replaced with 0. No rows are dropped, so the shape stays
# the same and only the missing-value count changes.
training = training.fillna(0)
print(f"After fill NA: {training.shape}, missing values: {int(training.isna().sum().sum())}")
training.head(2)

Before drop NA: (1460, 9)
After drop NA: (1460, 9)


Unnamed: 0,LotArea,LotFrontage,OverallQual,OverallCond,MasVnrArea,1stFlrSF,2ndFlrSF,GarageArea,SalePrice
0,8450,65.0,7,5,196.0,856,854,548,208500
1,9600,80.0,6,8,0.0,1262,0,460,181500


In [30]:
# Best model found so far and its test RMSE. Starting from +infinity guarantees
# that the first candidate is always accepted, whatever the scale of the target
# (a finite sentinel such as 2 ** 32 could be exceeded and leave `model` as None).
model, rmse = None, float("inf")

In [31]:
# Seed NumPy's global RNG so the 1000 random splits, and therefore the selected
# model and the reported RMSE, are the same on every Restart & Run All.
np.random.seed(42)

# NOTE(review): the model is chosen by its score on the same held-out split
# that is then reported, so the printed RMSE is the minimum over 1000 random
# splits and is optimistically biased. Confirm this is intended.
for _ in range(1000):
    train, test = random_split_on_data(training, train_split)
    # Use the shared `y_col` constant instead of repeating the column name.
    train_x, train_y = split_dataset_supervised(train, features, y_col)
    test_x, test_y = split_dataset_supervised(test, features, y_col)

    lr_model = LinearRegression().fit(train_x, train_y)

    # RMSE on the held-out rows of this split
    test_rmse = calculate_rmse(lr_model, test_x, test_y)

    # Keep the candidate with the lowest held-out RMSE seen so far.
    if test_rmse < rmse:
        rmse = test_rmse
        model = lr_model

print(f"RMSE: {rmse}")

RMSE: 28386.873645797088


In [25]:
# Persist the selected model under the `_fillna` file name. The save to the
# file name without that suffix is intentionally not performed here.
save_model(
    model,
    "models/linear_regression_multiple_numerical_features_fillna.scikit_model",
)

In [27]:
# Prediction input: the id column plus the model features, taken from a deep
# copy of the raw test table, with missing values replaced by 0.
testing = test_data.copy(deep=True)[[id_col, *features]].fillna(0)

# Predict the target for every test row and write the (id, prediction) CSV.
get_submission_file(
    model,
    testing,
    y_col,
    id_col,
    features,
    "dataset/linear_regression_multiple_numerical_features_fillna.csv",
)