# House Price Prediction (Step0-Step2)

Writer: Zihang WANG_AIS

Date: 06/03/2024

Envrionment: ~/requirements.txt

## Abstact

This project is for DSP(2024 Spring) only, more information about the dataset and background [here](https://www.kaggle.com/c/house-prices-advanced-regression-techniques/data).  It aims to create a model to predict the saleprices of the houses by following these steps: 

Data setup(load, train and test sets split, etc)

Feature processing: process, scale and encode the different features

Model training

Model evaluation

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from scipy.stats import norm
from sklearn.preprocessing import StandardScaler
from scipy import stats
import warnings
import joblib

warnings.filterwarnings("ignore")
%load_ext lab_black

## Model Training

### Import Data

In [2]:
data_train_raw = pd.read_csv("data/housing_price_train.csv", index_col="Id")
data_train = data_train_raw.copy()

In [3]:
# Train Checking
data_train.shape

(1460, 80)

In [4]:
data_train.head()

Unnamed: 0_level_0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,Inside,...,0,,,,0,2,2008,WD,Normal,208500
2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,FR2,...,0,,,,0,5,2007,WD,Normal,181500
3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,Inside,...,0,,,,0,9,2008,WD,Normal,223500
4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,Corner,...,0,,,,0,2,2006,WD,Abnorml,140000
5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,FR2,...,0,,,,0,12,2008,WD,Normal,250000


### Train/Test Split

In [5]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

In [6]:
"""
Train/Test split

X: all columns from train_data except SalePrice
y: SalePrice
"""
X, y = data_train.loc[:, data_train.columns != "SalePrice"], data_train["SalePrice"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)
for data in [X_train, X_test, y_train, y_test]:
    data.reset_index(drop=True, inplace=True)

In [7]:
X_train.shape, y_train.shape, X_test.shape, y_test.shape

((1022, 79), (1022,), (438, 79), (438,))

### Preprocessing

#### Continuous Column

In [8]:
# get_continuous_columns from the dataframe
def get_continuous_columns(df: pd.DataFrame) -> pd.Series:
    return df.select_dtypes(include="number").columns

In [9]:
# fill the missing data
def fillna_continuous(df: pd.DataFrame) -> pd.DataFrame:
    columns = get_continuous_columns(df)
    [df[column].fillna(0, inplace=True) for column in columns]
    return df

#### Categorical Columns

In [10]:
# get_continuous_columns from the dataframe
def get_categorical_columns(df: pd.DataFrame) -> pd.Series:
    return df.select_dtypes(include="object").columns

In [11]:
# fill the missing data
def fillna_categorical(df: pd.DataFrame) -> pd.DataFrame:
    columns = get_categorical_columns(df)
    [df[column].fillna("Unknown", inplace=True) for column in columns]
    return df

#### Encode categorical columns

In [12]:
from sklearn.preprocessing import OneHotEncoder


# Create encoder
def make_encoder(df: pd.DataFrame) -> OneHotEncoder:
    path = "/Users/ericwindsor/Documents/EPITA_ERIC/Data_Scicence_Production/dsp-zihang-wang/models/"
    categorical_columns = get_categorical_columns(df)
    encoder = OneHotEncoder(handle_unknown="ignore", dtype=int)
    encoder.fit(df[categorical_columns])
    encoder_name = "encoder.OneHotEncoder"
    encoder_path = path + encoder_name
    joblib.dump(encoder, encoder_path)
    return encoder, encoder_path


# Encode the categorial columns
def encode_categorical(df: pd.DataFrame, encoder: [OneHotEncoder]) -> pd.DataFrame:
    categorical_columns = get_categorical_columns(df)
    encoded_columns = encoder.transform(df[categorical_columns])
    encoded_df = pd.DataFrame(
        encoded_columns.toarray(),
        columns=encoder.get_feature_names_out(categorical_columns),
    )
    df = df.drop(categorical_columns, axis=1).join(encoded_df)
    return df

#### Encapsulation

In [13]:
# Encapusulate functions of all processes
def process_data(df: pd.DataFrame) -> pd.DataFrame:
    df = fillna_continuous(df)
    df = fillna_categorical(df)
    return df

### Model Training

In [14]:
# Train data processing
encoder, encoder_path = make_encoder(X_train)
X_train = encode_categorical(X_train, encoder)
process_data(X_train)

Unnamed: 0,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,BsmtFinSF2,...,SaleType_ConLw,SaleType_New,SaleType_Oth,SaleType_WD,SaleCondition_Abnorml,SaleCondition_AdjLand,SaleCondition_Alloca,SaleCondition_Family,SaleCondition_Normal,SaleCondition_Partial
0,60,0.0,9375,7,5,1997,1998,573.0,739,0,...,0,0,0,1,0,0,0,0,1,0
1,120,0.0,2887,6,5,1996,1997,0.0,1003,0,...,0,0,0,1,0,0,0,0,1,0
2,20,50.0,7207,5,7,1958,2008,0.0,696,0,...,0,0,0,1,0,0,0,0,1,0
3,50,60.0,9060,6,5,1939,1950,0.0,204,0,...,0,0,0,1,0,0,0,0,1,0
4,30,60.0,8400,2,5,1920,1950,0.0,290,0,...,0,0,0,1,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1017,60,82.0,9430,8,5,1999,1999,673.0,1163,0,...,0,0,0,1,0,0,0,0,1,0
1018,20,60.0,9600,4,7,1950,1995,0.0,442,0,...,0,0,0,1,0,0,0,0,1,0
1019,90,68.0,8930,6,5,1978,1978,0.0,0,0,...,0,0,0,1,0,0,0,0,1,0
1020,120,0.0,3196,7,5,2003,2004,18.0,0,0,...,0,0,0,1,0,0,0,0,1,0


In [15]:
model = LinearRegression()
model.fit(X_train, y_train)

In [16]:
path = "/Users/ericwindsor/Documents/EPITA_ERIC/Data_Scicence_Production/dsp-zihang-wang/models/"
model_name = "lreg.model"
model_path = path + model_name
joblib.dump(model, model_path)

['/Users/ericwindsor/Documents/EPITA_ERIC/Data_Scicence_Production/dsp-zihang-wang/models/lreg.model']

### Model evaluation

In [17]:
# Test data processing
X_test = encode_categorical(X_test, encoder)
process_data(X_test)

Unnamed: 0,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,BsmtFinSF2,...,SaleType_ConLw,SaleType_New,SaleType_Oth,SaleType_WD,SaleCondition_Abnorml,SaleCondition_AdjLand,SaleCondition_Alloca,SaleCondition_Family,SaleCondition_Normal,SaleCondition_Partial
0,20,0.0,32668,6,3,1957,1975,0.0,1219,0,...,0,0,0,1,0,0,1,0,0,0
1,50,79.0,9490,6,7,1941,1950,0.0,403,165,...,0,0,0,1,0,0,0,0,1,0
2,50,0.0,7015,5,4,1950,1950,161.0,185,0,...,0,0,0,1,0,0,0,0,1,0
3,60,83.0,10005,7,5,1977,1977,299.0,392,0,...,0,0,0,1,0,0,0,0,1,0
4,160,21.0,1680,6,5,1971,1971,381.0,0,0,...,0,0,0,1,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
433,20,73.0,39104,7,7,1954,2005,0.0,226,1063,...,0,0,0,1,0,0,0,0,1,0
434,20,73.0,9855,6,5,1956,1956,0.0,0,0,...,0,0,0,0,0,0,0,0,1,0
435,20,91.0,10437,8,6,1995,1995,660.0,1696,0,...,0,0,0,1,0,0,0,0,1,0
436,20,67.0,9808,7,5,2002,2002,110.0,788,0,...,0,0,0,1,0,0,0,0,1,0


In [18]:
y_pred = abs(model.predict(X_test))

In [19]:
from sklearn.metrics import mean_squared_log_error


def compute_rmsle(y_test: np.ndarray, y_pred: np.ndarray, precision: int = 2) -> float:
    rmsle = np.sqrt(mean_squared_log_error(y_test, y_pred))
    return round(rmsle, precision)

In [20]:
compute_rmsle(y_test, y_pred)

0.29

## Model inference

In [21]:
data_test_raw = pd.read_csv("data/housing_price_test.csv", index_col="Id")
data_test = data_test_raw.copy()

In [22]:
data_test.head()

Unnamed: 0_level_0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1461,20,RH,80.0,11622,Pave,,Reg,Lvl,AllPub,Inside,...,120,0,,MnPrv,,0,6,2010,WD,Normal
1462,20,RL,81.0,14267,Pave,,IR1,Lvl,AllPub,Corner,...,0,0,,,Gar2,12500,6,2010,WD,Normal
1463,60,RL,74.0,13830,Pave,,IR1,Lvl,AllPub,Inside,...,0,0,,MnPrv,,0,3,2010,WD,Normal
1464,60,RL,78.0,9978,Pave,,IR1,Lvl,AllPub,Inside,...,0,0,,,,0,6,2010,WD,Normal
1465,120,RL,43.0,5005,Pave,,IR1,HLS,AllPub,Inside,...,144,0,,,,0,1,2010,WD,Normal


In [23]:
encoder_train = joblib.load(encoder_path)
data_test = encode_categorical(data_test, encoder_train)
process_data(data_test)

Unnamed: 0_level_0,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,BsmtFinSF2,...,SaleType_ConLw,SaleType_New,SaleType_Oth,SaleType_WD,SaleCondition_Abnorml,SaleCondition_AdjLand,SaleCondition_Alloca,SaleCondition_Family,SaleCondition_Normal,SaleCondition_Partial
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1461,20,80.0,11622,5,6,1961,1961,0.0,468.0,144.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1462,20,81.0,14267,6,6,1958,1958,108.0,923.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1463,60,74.0,13830,5,5,1997,1998,0.0,791.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1464,60,78.0,9978,6,6,1998,1998,20.0,602.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1465,120,43.0,5005,8,5,1992,1992,0.0,263.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2915,160,21.0,1936,4,7,1970,1970,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2916,160,21.0,1894,4,5,1970,1970,0.0,252.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2917,20,160.0,20000,5,7,1960,1996,0.0,1224.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2918,85,62.0,10441,5,5,1992,1992,0.0,337.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [24]:
joblib_model = joblib.load(model_path)
tesr_pred = abs(joblib_model.predict(data_test))

In [25]:
tesr_pred[:5]

array([106316.53150199,   6471.77076516,  43276.91487479,  36059.47692577,
        66941.94922562])