In [1]:
import pandas as pd

from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.linear_model import LinearRegression

from common_libraries import dataset, models

In [2]:
def get_submission_file(model, data, y_col, id_col, pred_cols, output_file):
    pred = data.copy(deep=True)
    pred[y_col] = model.predict(pred[pred_cols])
    pred = pred[[id_col, y_col]]
    pred.to_csv(output_file, index=False)

In [3]:
train_data = pd.read_csv("dataset/train.csv")
test_data = pd.read_csv("dataset/test.csv")

In [4]:
y_col = "SalePrice"
id_col = "Id"
pre_features = ["LotArea", "Neighborhood"]
train_split = .7
post_features = []

In [5]:
trainset = train_data.copy(deep=True)[[id_col] + pre_features + [y_col]]
neighborhood_encoder = dataset.fit_categorical_encoder(trainset, "Neighborhood")
trainset = dataset.transform_with_fitted_encoder(trainset, "Neighborhood", neighborhood_encoder)
post_features = list(set(trainset.columns) - set([id_col, y_col]))
print(post_features)
trainset.head(2)

['Sawyer', 'Veenker', 'NWAmes', 'NridgHt', 'CollgCr', 'IDOTRR', 'ClearCr', 'Edwards', 'SawyerW', 'Blmngtn', 'LotArea', 'NPkVill', 'Mitchel', 'Somerst', 'Blueste', 'BrDale', 'OldTown', 'Crawfor', 'StoneBr', 'Timber', 'BrkSide', 'NoRidge', 'Gilbert', 'MeadowV', 'SWISU', 'NAmes']


Unnamed: 0,Id,LotArea,SalePrice,Blmngtn,Blueste,BrDale,BrkSide,ClearCr,CollgCr,Crawfor,...,NoRidge,NridgHt,OldTown,SWISU,Sawyer,SawyerW,Somerst,StoneBr,Timber,Veenker
0,1,8450,208500,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2,9600,181500,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


### Dealing with features

- Fill missing values with some value depending on the feature
- Transform data that can have a numerical representation (such as Low, Medium and High_
- Transform categorical data into arrays (Street: Grvl -> `[0, 1]` -> 0 on Pave and 1 on Gravel)

In [16]:
features = post_features

train, test = dataset.random_split_on_data(trainset, train_split)
train_x, train_y = dataset.split_dataset_supervised(train, features, y_col)
test_x, test_y = dataset.split_dataset_supervised(test, features, y_col)

lr_model = LinearRegression().fit(train_x, train_y)
rmse = models.calculate_rmse_log(lr_model, test_x, test_y)

print(f"RMSE - using log: {rmse}")

RMSE - using log: 4.445460097286023


In [13]:
models.save(lr_model, "models/linear_regression_neighborhood.scikit_model")

In [14]:
# I must use the encoder created using the training data, but I'll allow myself this shortcut for the moment
testset = test_data.copy(deep=True)[[id_col] + pre_features]
testset = dataset.transform_with_fitted_encoder(testset, "Neighborhood", neighborhood_encoder)
testset.head(2)

Unnamed: 0,Id,LotArea,Blmngtn,Blueste,BrDale,BrkSide,ClearCr,CollgCr,Crawfor,Edwards,...,NoRidge,NridgHt,OldTown,SWISU,Sawyer,SawyerW,Somerst,StoneBr,Timber,Veenker
0,1461,11622,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1462,14267,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [15]:
get_submission_file(lr_model, testset, y_col, id_col, features, "dataset/linear_regression_neighborhood.csv")