In [1]:
import math
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import pickle

from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

In [2]:
# Use seaborn's "darkgrid" theme for every plot drawn later in the notebook.
sns.set_theme(style="darkgrid")

In [3]:
# Column names reused by later cells: the row identifier and the target.
id_col = "Id"
output_col = "SalePrice"

# Labelled training rows and the rows to predict for the submission file.
train_data = pd.read_csv("dataset/train.csv")
test_data = pd.read_csv("dataset/test.csv")

In [4]:
# Peek at the first five training rows to sanity-check the load.
train_data.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [5]:
def separate_x_and_y(df: pd.DataFrame, y_col: str):
    """Split a frame into independent feature and target objects.

    Parameters
    ----------
    df : pd.DataFrame
        Source frame. It is left unmodified.
    y_col : str
        Name of the target column to split off.

    Returns
    -------
    tuple
        ``(x, y)`` where ``x`` is a deep copy of ``df`` without ``y_col`` and
        ``y`` is a deep copy of ``df[y_col]``.

    Raises
    ------
    KeyError
        If ``y_col`` is not a column of ``df``.
    """
    # Copy only the target column instead of deep-copying the whole frame a
    # second time just to pull a single column out of it.
    y = df[y_col].copy(deep=True)
    x = df.copy(deep=True)
    del x[y_col]
    return x, y

In [6]:
def random_split_data(df: pd.DataFrame, size_threshold: float):
    """Randomly partition the rows of ``df`` into a train and a test frame.

    Each row is assigned independently: it lands in the train part when a
    uniform draw from ``[0, 1)`` is below ``size_threshold``, otherwise in the
    test part. The resulting sizes are therefore only approximately
    ``size_threshold`` and ``1 - size_threshold`` of the input.

    Draws come from NumPy's global random state, so results differ between
    runs unless the caller seeds it beforehand.

    Parameters
    ----------
    df : pd.DataFrame
        Frame whose rows are split.
    size_threshold : float
        Probability that any given row goes to the train part.

    Returns
    -------
    tuple
        ``(train, test)`` frames selected from ``df`` by a boolean mask.
    """
    split = np.random.rand(len(df)) < size_threshold
    # Slice the frame that was passed in. The previous version indexed the
    # global `train_data`, silently ignoring `df`.
    train = df[split]
    test = df[~split]
    return train, test

In [7]:
def rmse(mdl: LinearRegression, X, y):
    """Return the root-mean-squared error of ``mdl``'s predictions on ``X``.

    Parameters
    ----------
    mdl : LinearRegression
        Fitted model; only its ``predict`` method is used.
    X
        Feature data passed straight to ``mdl.predict``.
    y
        True target values compared against the predictions.

    Returns
    -------
    float
        RMSE between ``y`` and the predictions. For multi-output targets the
        per-output RMSE values are averaged uniformly.
    """
    pred = mdl.predict(X)
    # `mean_squared_error(..., squared=False)` was deprecated in scikit-learn
    # 1.4 and removed in 1.6. Taking the root of the per-output MSE and then
    # averaging gives the same value and works on old and new releases alike.
    per_output_mse = mean_squared_error(y, pred, multioutput="raw_values")
    return np.sqrt(per_output_mse).mean()

In [8]:
def save_model(model, model_name):
    """Pickle ``model`` into the file at path ``model_name``.

    The file is opened in binary write mode, so any existing file at that
    path is overwritten.
    """
    with open(model_name, "wb") as sink:
        pickle.Pickler(sink).dump(model)

In [9]:
def get_submission_file(model, data, y_col, id_col, pred_cols, output_file):
    """Predict on ``data`` and write an ``id_col``/``y_col`` CSV.

    Parameters
    ----------
    model
        Object exposing ``predict``; it is called with ``data[pred_cols]``.
    data : pd.DataFrame
        Rows to predict for. A deep copy is used, so ``data`` is not modified.
    y_col : str
        Name of the column that receives the predictions.
    id_col : str
        Name of the identifier column kept in the output.
    pred_cols : list
        Feature columns fed to the model.
    output_file
        Destination passed to ``DataFrame.to_csv``; written without the index.
    """
    submission = data.copy(deep=True)
    features = submission[pred_cols]
    submission[y_col] = model.predict(features)
    # Keep only the identifier and the prediction, in that order.
    submission[[id_col, y_col]].to_csv(output_file, index=False)

### Training with LotArea only

In [25]:
# Best model out of several runs.
# Start from +infinity rather than an arbitrary large number, so the first
# evaluated model always becomes the winner whatever the scale of the error.
win_model, min_rmse = None, math.inf

In [26]:
# Training with LotArea only
# Fit one model per random split and keep the one with the lowest RMSE on its
# own held-out part.
# NOTE(review): the splits use NumPy's global random state and no seed is set
# in this cell, so the winner and its score can change from run to run.
# NOTE(review): the winner is chosen by its score on the split it is reported
# on, so `min_rmse` is an optimistic estimate of the error on unseen data.
feature_cols = ["LotArea"]

for i in range(1000):
    for s in [.8, .7]:
        train, test = random_split_data(train_data, s)
        # Use the notebook-wide target column name instead of repeating the literal.
        train_x, train_y = separate_x_and_y(train, output_col)
        test_x, test_y = separate_x_and_y(test, output_col)

        lr_model = LinearRegression().fit(train_x[feature_cols], train_y)

        # RMSE
        test_rmse = rmse(lr_model, test_x[feature_cols], test_y)

        if test_rmse < min_rmse:
            min_rmse = test_rmse
            win_model = lr_model

print(f"Winner model has {min_rmse} RMSE")

Winner model has 59972.02705792874 RMSE


In [31]:
# Persist the winning model (lowest held-out RMSE) to disk with pickle.
# NOTE(review): assumes the models/ directory already exists; confirm before a fresh run.
save_model(win_model, "models/linear_regression_lot_area.scikit_model")

In [32]:
# Save this model run: predict the target for the test rows with the winning
# model and write the id/prediction pairs as the submission CSV.
# Uses the notebook-wide column constants rather than repeating the literals.
get_submission_file(win_model, test_data, output_col, id_col, ["LotArea"], "dataset/submission_linear_regression_lot_area.csv")

In [29]:
# Show the estimator's repr. `lr_model` is the model fitted in the last loop
# iteration, which is not necessarily the same object as `win_model`.
str(lr_model)

'LinearRegression()'