# House Price Prediction 

Writer: Zihang WANG_AIS

Date: 06/03/2024

Envrionment: ~/requirements.txt

## Abstact

This project is for DSP(2024 Spring) only, more information about the dataset and background [here](https://www.kaggle.com/c/house-prices-advanced-regression-techniques/data).  It aims to create a model to predict the saleprices of the houses by following these steps: 

Data setup(load, train and test sets split, etc)

Feature processing: process, scale and encode the different features

Model training

Model evaluation

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from scipy.stats import norm
from sklearn.preprocessing import StandardScaler
from scipy import stats
import warnings
import joblib

warnings.filterwarnings("ignore")
%load_ext lab_black

## Model Training

### Import Data

In [2]:
data_train_raw = pd.read_csv("data/housing_price_train.csv", index_col="Id")
data_train = data_train_raw.copy()

In [3]:
# Train Checking
data_train.shape

(1460, 80)

In [4]:
data_train.head()

Unnamed: 0_level_0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,Inside,...,0,,,,0,2,2008,WD,Normal,208500
2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,FR2,...,0,,,,0,5,2007,WD,Normal,181500
3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,Inside,...,0,,,,0,9,2008,WD,Normal,223500
4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,Corner,...,0,,,,0,2,2006,WD,Abnorml,140000
5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,FR2,...,0,,,,0,12,2008,WD,Normal,250000


### Train/Test Split

In [5]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

In [6]:
"""
Train/Test split

X: all columns from train_data except SalePrice
y: SalePrice
"""
X, y = data_train.loc[:, data_train.columns != "SalePrice"], data_train["SalePrice"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)
for data in [X_train, X_test, y_train, y_test]:
    data.reset_index(drop=True, inplace=True)

In [7]:
X_train.shape, y_train.shape, X_test.shape, y_test.shape

((1022, 79), (1022,), (438, 79), (438,))

### Preprocessing

#### Continuous Column

In [8]:
# get_continuous_columns from the dataframe
def get_continuous_columns(df: pd.DataFrame) -> pd.Series:
    return df.select_dtypes(include="number").columns

In [9]:
# fill the missing data
def fillna_continuous(df: pd.DataFrame) -> pd.DataFrame:
    columns = get_continuous_columns(df)
    [df[column].fillna(0, inplace=True) for column in columns]
    return df

#### Categorical Columns

In [10]:
# get_continuous_columns from the dataframe
def get_categorical_columns(df: pd.DataFrame) -> pd.Series:
    return df.select_dtypes(include="object").columns

In [11]:
# fill the missing data
def fillna_categorical(df: pd.DataFrame) -> pd.DataFrame:
    columns = get_categorical_columns(df)
    [df[column].fillna("Unknown", inplace=True) for column in columns]
    return df

#### Encode categorical columns

In [12]:
from sklearn.preprocessing import OneHotEncoder


# Create encoder
def make_encoder(df: pd.DataFrame) -> OneHotEncoder:
    path = "/Users/ericwindsor/Documents/EPITA_ERIC/Data_Scicence_Production/dsp-zihang-wang/models/"
    categorical_columns = get_categorical_columns(df)
    encoder = OneHotEncoder(handle_unknown="ignore", dtype=int)
    encoder.fit(df[categorical_columns])
    encoder_name = "encoder.OneHotEncoder"
    encoder_path = path + encoder_name
    joblib.dump(encoder, encoder_path)
    return encoder, encoder_path


# Encode the categorial columns
def encode_categorical(df: pd.DataFrame, encoder: [OneHotEncoder]) -> pd.DataFrame:
    categorical_columns = get_categorical_columns(df)
    encoded_columns = encoder.transform(df[categorical_columns])
    encoded_df = pd.DataFrame(
        encoded_columns.toarray(),
        columns=encoder.get_feature_names_out(categorical_columns),
    )
    df = df.drop(categorical_columns, axis=1).join(encoded_df)
    return df

#### Encapsulation

In [13]:
# Encapusulate functions of all processes
def process_data(df: pd.DataFrame) -> pd.DataFrame:
    df = fillna_continuous(df)
    df = fillna_categorical(df)

### Model Training

In [14]:
# Train data processing
encoder, encoder_path = make_encoder(X_train)
X_train = encode_categorical(X_train, encoder)
process_data(X_train)
# Test data processing
X_test = encode_categorical(X_test, encoder)
process_data(X_test)

In [15]:
from sklearn.metrics import mean_squared_log_error


def build_model(
    X_train: pd.DataFrame, X_test: pd.DataFrame, y_train: np.ndarray, y_test: np.ndarray
) -> dict[str, str]:
    # Returns a dictionary with the model performances (for example {'rmse': 0.18})
    model = LinearRegression()
    model.fit(X_train, y_train)
    path = "/Users/ericwindsor/Documents/EPITA_ERIC/Data_Scicence_Production/dsp-zihang-wang/models/"
    model_name = "lreg.model"
    model_path = path + model_name
    joblib.dump(model, model_path)
    y_pred = abs(model.predict(X_test))
    rmsle = np.sqrt(mean_squared_log_error(y_test, y_pred))

    return {"rmsle": round(rmsle, 2), "model_path": model_path}

### Model evaluation

In [16]:
accuray, model_path = build_model(X_train, X_test, y_train, y_test).values()
accuray

0.29

## Model inference

In [17]:
data_test_raw = pd.read_csv("data/housing_price_test.csv", index_col="Id")
data_test = data_test_raw.copy()

In [18]:
def make_predictions(
    data_test: pd.DataFrame, encoder_path: str, model_path: str
) -> np.ndarray:
    # the model and all the data preparation objects (encoder, etc) should be loaded from the models folder
    encoder_train = joblib.load(encoder_path)
    data_test = encode_categorical(data_test, encoder_train)
    process_data(data_test)
    joblib_model = joblib.load(model_path)
    tesr_pred = abs(joblib_model.predict(data_test))
    return tesr_pred

In [19]:
make_predictions(data_test, encoder_path, model_path)

array([106316.53150199,   6471.77076516,  43276.91487479, ...,
        58066.13615507,  18766.96128885,    709.92067789])